In [311]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import nltk
import html
import string
import ast
from textblob import TextBlob
from sklearn.externals import joblib

#### import data

In [312]:
# Ask for the human-readable event name (e.g. "Xbox E3").
event_name = input('Enter Event Name: ')
# Remove every run of non-word characters so the name can be embedded in the
# data file names. The pattern is a raw string: "\W" inside a plain string
# literal is an invalid escape sequence that newer Python versions warn about.
event_filename = re.sub(r"\W+", "", event_name.strip())

Enter Event Name: Xbox E3


In [313]:
# Load the tweet table for the chosen event (tab-separated, UTF-8, header row).
df = pd.read_csv(
    'data/final/event_%s_data.txt' % event_filename,
    sep='\t',
    encoding='utf-8',
    header=0,
    parse_dates=['created_at'],    # parsed as timestamps; truncated to the minute later on
    dtype={'twitter_id' : 'str'},  # read tweet ids as strings
)
df.shape

(86475, 36)

In [314]:
# Subevent table; the first column of the file becomes the index.
dfsubevents = pd.read_csv('data/final/event_%s_subevents.txt' % event_filename, sep='\t', encoding='utf-8', header=0, index_col=0)
# Convert the index to datetimes with pd.to_datetime(): the Index.to_datetime()
# method used previously was deprecated and later removed from pandas.
dfsubevents.index = pd.to_datetime(dfsubevents.index)
dfsubevents.shape

(1440, 5)

In [315]:
# Second subevent table ("e_subevents"); the first column of the file becomes the index.
df_e_subevents = pd.read_csv('data/final/event_%s_e_subevents.txt' % event_filename, sep='\t', encoding='utf-8', header=0, index_col=0)
# Convert the index to datetimes with pd.to_datetime(): the Index.to_datetime()
# method used previously was deprecated and later removed from pandas.
df_e_subevents.index = pd.to_datetime(df_e_subevents.index)
df_e_subevents.shape

(1440, 7)

In [316]:
# Rank cut-off: a tweet is kept only if its minute has rank_subevents <= k in
# at least one of the two subevent tables (see the filter that builds `data`).
k = 25

In [317]:
# Join the two subevent tables on their shared index, keeping only index values
# present in both. Both sides carry a 'rank_subevents' column, so the merged
# frame exposes them as 'rank_subevents_x' (left) and 'rank_subevents_y' (right).
allsubevents = dfsubevents[['count', 'mean', 'deviation', 'rank_subevents']].merge(
    df_e_subevents[['count+', 'rank_subevents']],
    how='inner',
    left_index=True,
    right_index=True,
)

In [318]:
# Attach the per-minute subevent statistics to every tweet and keep only the
# tweets that fall into a top-k minute.

# Explicit copy: assigning a column on a bare slice of `df` triggers pandas'
# SettingWithCopyWarning and is not guaranteed to modify the intended object.
dftemp = df[['twitter_id', 'created_at']].copy()
# Zero the seconds so each tweet's timestamp can be matched against the index of `allsubevents`.
dftemp['created_at'] = dftemp['created_at'].apply(lambda x: x.replace(second=0))
dftemp = dftemp.set_index(['created_at'])

# twitter_id -> subevent statistics of the tweet's minute
data = pd.merge(dftemp, allsubevents, how='inner', left_index=True, right_index=True)
data = data.set_index('twitter_id')

# Full tweet rows plus the subevent statistics (NaN where the minute had no match).
data = pd.merge(df.set_index('twitter_id'), data, how='left', left_index=True, right_index=True)

# Keep tweets whose minute ranks within the top k in either ranking. The copy
# makes `data` an independent frame, so the feature columns added by later
# cells do not raise SettingWithCopyWarning.
data = data[(data['rank_subevents_x'] <= k) | (data['rank_subevents_y'] <= k)].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [319]:
# (rows, columns) of the tweets retained by the top-k subevent filter.
data.shape

(25069, 41)

### create features

In [320]:
def _strip_markup(text):
    """Unescape HTML entities, drop '#' and '@', and cut each word at the first 'http'."""
    words = html.unescape(str(text)).replace('#', '').replace('@', '').split()
    kept = []
    for w in words:
        # Everything from 'http' onwards is removed from the word.
        kept.append(w[:w.find('http')] if 'http' in w else w)
    return ' '.join(kept)


data['text_clean'] = data['text'].apply(_strip_markup)

# Re-tokenise with NLTK's tweet tokenizer and lower-case every token.
tknzr = nltk.TweetTokenizer()
data['text_clean'] = data['text_clean'].apply(
    lambda text: ' '.join(w.lower() for w in tknzr.tokenize(text))
)

In [321]:
# One 0/1 indicator per client name found in 'tweet_type'. Missing values
# compare unequal and therefore yield 0, exactly as before.
data['source_web_client'] = (data['tweet_type'] == 'Twitter Web Client').astype(int)
data['source_tweetdeck'] = (data['tweet_type'] == 'TweetDeck').astype(int)
data['source_iphone'] = (data['tweet_type'] == 'Twitter for iPhone').astype(int)
data['source_android'] = (data['tweet_type'] == 'Twitter for Android').astype(int)
# 1 when a longitude is present, 0 when it is NaN. The previous form
# `[1,0][np.isnan(l)]` used a NumPy boolean as a list index, which NumPy has
# deprecated.
data['has_location'] = data['longitude'].notna().astype(int)

In [322]:
# NOTE(review): this overwrites 'tweet_type' in place, so re-running the cell
# (or running the source_* cell afterwards) works on the 0/1 flag instead of
# the original values.
data['tweet_type'] = data['tweet_type'].apply(lambda d: 1 if d == 'media' else 0)
data['possibly_sensitive'] = data['possibly_sensitive'].apply(lambda d: 1 if d == True else 0)

# For every entity kind: a count column, then that count divided by the number
# of whitespace-separated words in the raw tweet text.
# NOTE(review): the entities_* columns appear to hold serialized lists (a later
# cell parses them with ast.literal_eval), in which case len() measures the
# string length rather than the number of entities. Left as-is because the
# loaded classifier was presumably trained on features computed this way --
# confirm before changing.
entity_columns = (
    ('entities_media', 'count_entities_media', 'media_per_word'),
    ('entities_urls', 'count_entities_urls', 'urls_per_word'),
    ('entities_mentions', 'count_entities_mentions', 'mentions_per_word'),
    ('entities_hashtags', 'count_entities_hashtags', 'hashtags_per_word'),
)
for source_col, count_col, ratio_col in entity_columns:
    data[count_col] = data[source_col].apply(len)
    data[ratio_col] = data.apply(
        lambda row: row[count_col] / len(row['text'].split()),
        axis=1,
    )

In [323]:
# Raw engagements = favourites + retweets; the per-word ratio is taken from the
# raw value, before any rescaling.
data['engagements'] = data['favorite_count'] + data['retweet_count']
data['engagements_per_word'] = data.apply(
    lambda row: row['engagements'] / len(row['text'].split()),
    axis=1,
)

# Rescale each count by the largest value observed within the same 'event' group.
for count_col in ('favorite_count', 'retweet_count', 'engagements'):
    data[count_col] = data.groupby('event')[count_col].transform(lambda s: s / s.max())

In [324]:
# Boolean user attributes as 0/1 flags (anything that is not == True becomes 0).
data['user_verified'] = data['user_verified'].apply(lambda d: [0,1][d==True])
data['user_default_profile'] = data['user_default_profile'].apply(lambda d: [0,1][d==True])
data['user_default_profile_image'] = data['user_default_profile_image'].apply(lambda d: [0,1][d==True])

# Followers per friend; a zero friend count gives +inf, which is mapped to NaN.
# The result of replace() is assigned back instead of calling
# `data[col].replace(..., inplace=True)`: in-place mutation through a column
# selection is chained assignment and can leave `data` unchanged in newer
# pandas versions (Copy-on-Write).
data['user_reputation'] = (data['user_followers'] / (data['user_friends'])).replace(np.inf, np.nan)

# Length of the profile description; str() turns a missing value into the text 'nan'.
data['user_bio_len'] = data['user_description'].apply(lambda bio: len(str(bio)))

# Whole days between account creation and the tweet.
data['user_age_days'] = data.apply(lambda doc: (doc['created_at'] - pd.to_datetime(doc['user_created_at'])).days, axis=1)

# Followers gained per day of account age; +inf (age of 0 days) is mapped to NaN.
data['user_follower_rate'] = (data['user_followers'] / data['user_age_days']).replace(np.inf, np.nan)

In [325]:
# Stop list = NLTK English stop words + punctuation characters + a few extra tokens.
stop = nltk.corpus.stopwords.words('english')
punct = list(string.punctuation)
# NOTE(review): 'RT' is upper-case while text_clean tokens are lower-cased, so
# this entry never matches; left unchanged to keep feature values stable.
punct.extend(['...', '..', '…', '”', '“', '.@', 'RT'])
stop.extend(punct)
# Set copy for O(1) membership tests; `stop` itself stays a list because other
# cells read it.
stop_lookup = set(stop)

# A letter followed by at least two repetitions of itself (e.g. "sooo").
elongation = re.compile("([a-zA-Z])\\1{2,}")
# Characters NOT counted as "non-characters". Raw string: the previous plain
# literal relied on invalid escape sequences (\w, \/, \-) that newer Python
# versions warn about. The compiled pattern is unchanged.
ordinary_characters = re.compile(r'[\w+!@#$%&;:,.?\/\-“”’`"\'()|]')


def _lexical_diversity(text):
    """Distinct tokens / total tokens; NaN for a text without tokens.

    The guard avoids a ZeroDivisionError when text_clean is empty (for example
    a tweet consisting only of a link). NaN is replaced by 0 together with all
    other missing feature values further down.
    """
    tokens = text.split()
    return len(set(tokens)) / len(tokens) if tokens else np.nan


data['count_characters'] = data['text'].apply(lambda text: len(str(text)))
data['count_non_characters'] = data['text'].apply(lambda text: len(ordinary_characters.sub('', text).strip()))
# Upper-case letters, ignoring words that start with '#' or '@'.
data['count_upper'] = data['text'].apply(lambda text: len([l for l in ' '.join([w for w in text.split() if not w.startswith(('#', '@'))]) if l.isupper()]))
data['count_tokens'] = data['text_clean'].apply(lambda text: len(text.split()))
data['mean_token_length'] = data['text_clean'].apply(lambda text: np.mean([len(t) for t in text.split()]))
data['mean_token_frequency'] = data['text_clean'].apply(lambda text: np.mean(list(Counter(text.split()).values())))
data['count_tokens_stopped'] = data['text_clean'].apply(lambda text: len([t for t in text.split() if t not in stop_lookup]))
data['count_stops'] = data['text_clean'].apply(lambda text: len([t for t in text.split() if t in stop_lookup]))
data['count_question_marks'] = data['text_clean'].apply(lambda text: text.split().count('?'))
data['elongation'] = data['text_clean'].apply(lambda text: 1 if elongation.search(text) else 0)
data['ellipsis'] = data['text_clean'].apply(lambda text: 1 if any(x in text for x in ('...', '…')) else 0)
data['lexical_diversity'] = data['text_clean'].apply(_lexical_diversity)
# Singular / plural first-person pronoun indicators.
sfpp = ['i', 'i\'m', 'me', 'mine', 'my', 'myself']
data['sfpp'] = data['text_clean'].apply(lambda text: 1 if any(t.lower() in sfpp for t in text.split()) else 0)
pfpp = ['we', 'we\'re', 'ours', 'our', 'ourselves']
data['pfpp'] = data['text_clean'].apply(lambda text: 1 if any(t.lower() in pfpp for t in text.split()) else 0)

In [326]:
# Run TextBlob once per tweet and reuse the result for both polarity and
# subjectivity (the analysis was previously executed twice per tweet).
tweet_sentiments = [TextBlob(str(text)).sentiment for text in data['text_clean']]
data['text_sentiment_polarity'] = [s.polarity for s in tweet_sentiments]
# Split the signed polarity into two non-negative magnitudes.
data['text_sentiment_positive'] = data['text_sentiment_polarity'].apply(lambda s: abs(s) if s > 0 else 0)
data['text_sentiment_negative'] = data['text_sentiment_polarity'].apply(lambda s: abs(s) if s < 0 else 0)
data['text_sentiment_subjectivity'] = [s.subjectivity for s in tweet_sentiments]

In [327]:
# Give the subevent statistics the names used in the feature list.
data = data.rename(columns={
    'count': 'frequency_1min',
    'mean': 'rollmean_frequency_win5min',
    'deviation': 'rollmean_frequency_deviation',
})

In [328]:
%%time
data['media_weight'] = np.nan
data['url_weight'] = np.nan
data['mention_weight'] = np.nan
data['hashtag_weight'] = np.nan
data['term_weight'] = np.nan
data['tfidf_mean'] = np.nan
data['event_centroid_distance'] = np.nan

dfevent = df[['twitter_id', 'text', 'entities_media', 'entities_urls', 'entities_mentions', 'entities_hashtags']]

#clean text
dfevent['text_clean'] = dfevent['text'].apply(lambda text: ' '.join([(w[:w.find('http')] if 'http' in w else w) for w in html.unescape(str(text)).replace('#', '').replace('@', '').split()]))
dfevent['text_clean'] = dfevent['text_clean'].apply(lambda text: ' '.join([w.lower() for w in tknzr.tokenize(text)]))

#media weight
entities_lists = dfevent['entities_media'].apply(lambda entities: ast.literal_eval(entities)).values
allentities = [e.lower() for elist in entities_lists for e in elist]
entity_counts = Counter(allentities)
dfevent['media_weight'] = dfevent['entities_media'].apply(lambda entities: sum([entity_counts.get(e.lower()) for e in ast.literal_eval(entities)]))
dfevent['media_weight'] = dfevent['media_weight'] / dfevent['media_weight'].max()
#dfevent['media_weight'] = (dfevent['media_weight'] - dfevent['media_weight'].mean()) / dfevent['media_weight'].std()

#url weight
entities_lists = dfevent['entities_urls'].apply(lambda entities: ast.literal_eval(entities)).values
allentities = [e.lower() for elist in entities_lists for e in elist]
entity_counts = Counter(allentities)
dfevent['url_weight'] = dfevent['entities_urls'].apply(lambda entities: sum([entity_counts.get(e.lower()) for e in ast.literal_eval(entities)]))
dfevent['url_weight'] = dfevent['url_weight'] / dfevent['url_weight'].max()
#dfevent['url_weight'] = (dfevent['url_weight'] - dfevent['url_weight'].mean()) / dfevent['url_weight'].std()

#mention weight
entities_lists = dfevent['entities_mentions'].apply(lambda entities: ast.literal_eval(entities)).values
allentities = [e.lower() for elist in entities_lists for e in elist]
entity_counts = Counter(allentities)
dfevent['mention_weight'] = dfevent['entities_mentions'].apply(lambda entities: sum([entity_counts.get(e.lower()) for e in ast.literal_eval(entities)]))
dfevent['mention_weight'] = dfevent['mention_weight'] / dfevent['mention_weight'].max()
#dfevent['mention_weight'] = (dfevent['mention_weight'] - dfevent['mention_weight'].mean()) / dfevent['mention_weight'].std()

#hashtag weight
entities_lists = dfevent['entities_hashtags'].apply(lambda entities: ast.literal_eval(entities)).values
allentities = [e.lower() for elist in entities_lists for e in elist]
entity_counts = Counter(allentities)
dfevent['hashtag_weight'] = dfevent['entities_hashtags'].apply(lambda entities: sum([entity_counts.get(e.lower()) for e in ast.literal_eval(entities)]))
dfevent['hashtag_weight'] = dfevent['hashtag_weight'] / dfevent['mention_weight'].max()
#dfevent['hashtag_weight'] = (dfevent['hashtag_weight'] - dfevent['hashtag_weight'].mean()) / dfevent['mention_weight'].std()

#term weight
tweets = [str(d) for d in dfevent['text_clean']]
tokens = [w for t in tweets for w in t.split() if w not in stop] 
token_counts = Counter(tokens)
dfevent['term_weight'] = dfevent['text_clean'].apply(lambda text: sum([token_counts.get(t) for t in text.split() if t not in stop]))
dfevent['term_weight'] = dfevent['term_weight'] / dfevent['term_weight'].max()
#dfevent['term_weight'] = (dfevent['term_weight'] - dfevent['term_weight'].mean()) / dfevent['term_weight'].std()
    
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
xtfidf = tfidf_vectorizer.fit_transform(dfevent['text_clean'])

#tfidf mean
xtfidf_means = xtfidf.mean(axis=1)
xtfidf_means = pd.DataFrame(xtfidf_means, columns=['tfidf_mean'])
xtfidf_means['tfidf_mean'] = xtfidf_means['tfidf_mean'] / xtfidf_means['tfidf_mean'].max()
dfevent = pd.merge(dfevent, xtfidf_means, how='inner', left_index=True, right_index=True)

#tfidf centroid distance
xtfidf_centroid = xtfidf.mean(axis=0)
xtfidf_cosdistance = pairwise_distances(X=xtfidf, Y=xtfidf_centroid, metric='cosine')
xtfidf_cosdistance = pd.DataFrame(xtfidf_cosdistance, columns=['event_centroid_distance'])
xtfidf_cosdistance['event_centroid_distance'] = xtfidf_cosdistance['event_centroid_distance'] / xtfidf_cosdistance['event_centroid_distance'].max()
dfevent = pd.merge(dfevent, xtfidf_cosdistance, how='inner', left_index=True, right_index=True)

#set index back to twitter id
dfevent.set_index('twitter_id', inplace=True)

#update columns in dataframe, on index (twitter_id)
data.update(dfevent)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Wall time: 25.7 s


#### features

In [329]:
# Names of the columns of `data` that are rescaled by their maximum and have
# NaN/inf replaced by 0 in the next two cells. The classifier itself is fed
# the columns listed in `xkbest`, which is read from file further down
# (presumably a subset of this list -- verify against the k-best file).
x = [
 'tweet_type',
 'possibly_sensitive',
 'user_default_profile',
 'user_default_profile_image',
 'user_verified',
 'user_statuses',
 'user_favourites',
 'user_followers',
 'user_friends',
 'user_listed',
 'source_web_client',
 'source_tweetdeck',
 'source_iphone',
 'source_android',
 'has_location',
 'count_entities_media',
 'media_per_word',
 'count_entities_urls',
 'urls_per_word',
 'count_entities_mentions',
 'mentions_per_word',
 'count_entities_hashtags',
 'hashtags_per_word',
 'favorite_count',
 'retweet_count',
 'engagements',
 'engagements_per_word',
 'user_bio_len',
 'user_reputation',
 'user_age_days',
 'user_follower_rate',
 'count_characters',
 'count_non_characters',
 'count_upper',
 'count_tokens',
 'mean_token_length',
 'mean_token_frequency',
 'count_tokens_stopped',
 'count_stops',
 'count_question_marks',
 'elongation',
 'ellipsis',
 'lexical_diversity',
 'sfpp',
 'pfpp',
 'text_sentiment_positive',
 'text_sentiment_negative',
 'text_sentiment_subjectivity',
 'frequency_1min',
 'rollmean_frequency_win5min',
 'rollmean_frequency_deviation',
 'media_weight',
 'url_weight',
 'mention_weight',
 'hashtag_weight',
 'term_weight',
 'tfidf_mean',
 'event_centroid_distance'
]

# Sanity check: number of feature names listed above.
print('FEATURES: %s' % len(x))

FEATURES: 58


In [330]:
def _scale_by_max(column):
    """Divide a feature column by its own maximum value."""
    return column / column.max()


# Rescale every feature column listed in x, one column at a time.
data[x] = data[x].apply(_scale_by_max, axis=0)

In [331]:
# Missing values first, then positive infinity: both become 0 in the feature columns.
data[x] = data[x].fillna(0).replace(np.inf, 0)

#### import classifier

In [332]:
# Load the previously trained classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
clf = joblib.load('clf/clfCUSTOM_train_Mix_test_Mix_RandomForestClassifier.pkl')

In [333]:
# Table stored next to the classifier; its 'feature' column is read in the next cell.
kbestfeatures = pd.read_csv(
    'clf/kbestfeatures_CUSTOM_train_Mix_test_Mix_RandomForestClassifier.txt',
    sep='\t',
    header=0,
)

In [334]:
# Column names, in file order, that are passed to the classifier.
xkbest = kbestfeatures['feature'].tolist()

In [335]:
# Classify every retained tweet using only the columns named in xkbest and
# store the prediction in a new 'news' column.
data['news'] = clf.predict(data[xkbest])

#### initialise news as NaN (not yet classified)

In [336]:
# Start with every tweet unlabelled (NaN); the following cells copy in the
# predictions for the tweets that were classified.
df['news'] = np.nan

#### update df with classified news

In [337]:
# Index df by tweet id so the update in the next cell aligns on twitter_id.
# NOTE(review): in-place and not re-runnable -- a second execution has no
# 'twitter_id' column left to move into the index.
df.set_index('twitter_id', inplace=True)

In [338]:
# Copy the predictions into df, aligned on the twitter_id index; tweets that
# are not present in `data` keep their NaN.
df.update(data['news'])

In [339]:
# Turn twitter_id back into a regular column before exporting.
df.reset_index(inplace=True)

#### export

In [340]:
# (rows, columns) of the full tweet table that is about to be exported.
df.shape

(86475, 36)

In [341]:
# Recompute the file-name-safe event name. Raw string for the pattern: "\W" in
# a plain string literal is an invalid escape sequence that newer Python
# versions warn about.
event_filename = re.sub(r"\W+", "", event_name.strip())
# NOTE(review): this is the same path the tweet table was read from at the top
# of the notebook, so the input file is overwritten with the added 'news' column.
df.to_csv('data/final/event_%s_data.txt' % event_filename, sep='\t', encoding='utf-8', header=True, index=False)