In [36]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn.metrics.pairwise import cosine_similarity
import math
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import psycopg2 as pg2
import psycopg2.extras as pgex
this_host='34.211.59.66'
this_user='postgres'
this_password='postgres'
conn = pg2.connect(host = this_host, 
                        user = this_user,
                        password = this_password)

sql_select = '''select created_at, location, tweet_content, cleaned_tweet, hashtags from tweets where hashtags != 'None';'''

cur = conn.cursor(cursor_factory=pgex.RealDictCursor)
cur.execute(sql_select)
rows = cur.fetchall()
conn.close()
df = pd.DataFrame(rows)
df.reset_index(inplace = True)

In [14]:
df['created_datetime'] = pd.to_datetime(df['created_at'])
df['year'] = df.created_datetime.apply(lambda x: x.year)
df['month'] = df.created_datetime.apply(lambda x: x.month)
df['day'] = df.created_datetime.apply(lambda x: x.day)
df['dayofweek'] = df.created_datetime.apply(lambda x: x.dayofweek)
df['hour'] = df.created_datetime.apply(lambda x: x.hour)

In [6]:
import pickle
!pip install redis
import redis
redis_ip = '34.211.59.66'
r = redis.StrictRedis(redis_ip)
r.keys()

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


[b'hashtags_tfidf_fit_transform',
 b'tweet_SVD_fit',
 b'tweet_tfidf_fit',
 b'tweet_tfidf_fit_transform']

In [11]:
hastages_series = df['hashtags'][df['hashtags'].isnull() == False]
len(hastages_series)

118746

In [41]:
count_vectorizer = CountVectorizer(min_df = 1)
hashtags_countvec_fit = count_vectorizer.fit(hastages_series)

In [42]:
hashtags_countvec = pickle.dumps(hashtags_countvec_fit)
r.set('hashtags_countvec_fit', hashtags_countvec)
r.keys()

[b'hashtags_countvec_fit_transform',
 b'hashtags_tfidf_fit_transform',
 b'tweet_SVD_fit',
 b'tweet_tfidf_fit',
 b'hashtags_countvec_fit',
 b'tweet_tfidf_fit_transform']

In [21]:
min(df['created_datetime']), max(df['created_datetime'])

(Timestamp('2017-05-31 23:50:46'), Timestamp('2017-06-07 14:24:48'))

In [None]:
min(df['created_datetime']) + timedelta(hours = 6)

In [37]:
time_delta = max(df['created_datetime']) - min(df['created_datetime'])
time_window = time_delta.components.days*24 + time_delta.components.hours
time_lag = timedelta(hours = 12)
time_gap = timedelta(hours = 6)

In [45]:
start_time = min(df['created_datetime'])
start_time = start_time
end_time = start_time + time_lag
subset = df[((df['created_datetime'] < end_time) & (df['created_datetime'] > start_time))]
print(len(subset))
hashtag_vec = hashtags_countvec_fit.transform(subset)
print(hashtag_vec.shape)

3767
(12, 54522)


In [None]:
start_time = min(df['created_datetime'])
for i in range(time_window):
    start_time = start_time
    end_time = start_time + time_lag
    subset = df[((df['created_datetime'] < end_time) & (df['created_datetime'] > start_time))]
    hashtags_countvec_fit.transform(subset)

In [None]:
def create_document_term_matrix(vectorizer, corpus):
    document_term_matrix = vectorizer.fit_transform(corpus)
    document_term_matrix = pd.DataFrame(document_term_matrix.toarray(),
                                        index=corpus,
                                        columns=vectorizer.get_feature_names ())
    return document_term_matrix

In [None]:
count_vectorizer = CountVectorizer(min_df = 1)
document_term_matrix = create_document_term_matrix(count_vectorizer,hastages_series)
document_term_matrix.head()

In [None]:
TFIDF_fit = pickle.loads(r.get('tweet_tfidf_fit'))
SVD_fit = pickle.loads(r.get('tweet_SVD_fit'))

In [None]:
def tweets_event_ab_test(event, n = 200):
    event_tweet = df[df['cleaned_tweet'].str.contains((event))]
    event_tweet.reset_index(inplace = True)
    
    A_index = list(range(event_tweet.shape[0]))
    random.shuffle(A_index)
    
    n = n
    A1_index = []
    for i in range(n):
        A1_index.append(A_index.pop())
    
    A1 = event_tweet.iloc[A1_index,:]
    A1_vec = np.array([SVD_fit.transform(TFIDF_fit.transform([i])).ravel() for i in A1['cleaned_tweet']])

    A2 = event_tweet.iloc[A_index,:] 
    #A2.reset_index(inplace = True)
    A2_vec = np.array([SVD_fit.transform(TFIDF_fit.transform([i])).ravel() for i in A2['cleaned_tweet']])
    non_event_tweet = df[~df['cleaned_tweet'].str.contains(('event'))]
    non_event_tweet.reset_index(inplace = True)
    
    B_index = list(range(non_event_tweet.shape[0]))
    random.shuffle(B_index)
    n = n
    B1_index = []
    for i in range(n):
        B1_index.append(B_index.pop())
    m = A2.shape[0]
    B2_index = []
    for i in range(m):
        B2_index.append(B_index.pop())  
    
    
    B1 = non_event_tweet.iloc[B1_index,:]
    #B1.reset_index(inplace = True)
    B1_vec = np.array([SVD_fit.transform(TFIDF_fit.transform([i])).ravel() for i in B1['cleaned_tweet']])
    
    B2 = non_event_tweet.iloc[B2_index,:]
    #B2.reset_index(inplace = True)
    B2_vec = np.array([SVD_fit.transform(TFIDF_fit.transform([i])).ravel() for i in B2['cleaned_tweet']])
    
    A1_vec_mean = np.mean(A1_vec, axis=0)
    A2_vec_mean = np.mean(A2_vec, axis=0)
    B1_vec_mean = np.mean(B1_vec, axis=0)
    B2_vec_mean = np.mean(B2_vec, axis=0)
    
    a1a2 = cosine_similarity(A1_vec_mean.reshape(1,-1),A2_vec_mean.reshape(1,-1))[0][0]
    b1b2 = cosine_similarity(B2_vec_mean.reshape(1,-1),B1_vec_mean.reshape(1,-1))[0][0]
    a1b1 = cosine_similarity(A1_vec_mean.reshape(1,-1),B1_vec_mean.reshape(1,-1))[0][0]
    a2b2 = cosine_similarity(A2_vec_mean.reshape(1,-1),B2_vec_mean.reshape(1,-1))[0][0]
   
    a2_consim_list = []
    for i in range(min(n,m)):  
        a2_consim_list.append(cosine_similarity(A2_vec[i].reshape(1,-1),A1_vec_mean.reshape(1,-1))[0][0])
    a2_mean = np.mean(np.array(a2_consim_list))
    a2_std = np.std(np.array(a2_consim_list))
    
    b2_consim_list = []
    for i in range(min(n,m)):  
        b2_consim_list.append(cosine_similarity(B2_vec[i].reshape(1,-1),B1_vec_mean.reshape(1,-1))[0][0])
    b2_mean = np.mean(b2_consim_list)
    b2_std = np.std(b2_consim_list)
    
    a1b2_consim_list = []
    for i in range(min(n,m)):  
        a1b2_consim_list.append((cosine_similarity(B2_vec[i].reshape(1,-1),A1_vec_mean.reshape(1,-1))[0][0]))
    a1b2_mean = np.mean(a1b2_consim_list)
    a1b2_std = np.std(a1b2_consim_list)
    
    print('A1|A2: ',a1a2,'\n'
                'B1|B2: ',b1b2, '\n\n'
                'A1|B1: ',a1b1, '\n'
                'A2|B2: ', a2b2, '\n\n'
                'Cosine Similarity Mean of A2 to A1', a2_mean, '\n'
                'Cosine Similarity Mean of B2 to B1', b2_mean, '\n'
                'Cosine Similarity Mean of B2 to A1', a1b2_mean, '\n\n'
                'Cosine Similarity STD of A2 to A1', a2_std, '\n'
                'Cosine Similarity STD of B2 to B1', b2_std, '\n'
                'Cosine Similarity STD of B2 to A1', a1b2_std, '\n'                 
                )
    
    return a2_consim_list, b2_consim_list

In [None]:
a2_consim_list, b2_consim_list = tweets_event_ab_test('paris|climate')

In [None]:
plt.scatter(range(len(a2_consim_list)), a2_consim_list)

In [None]:
plt.scatter(range(len(b2_consim_list)), b2_consim_list)

In [None]:
def event_tweet_count(event):
    event_tweet = df[df['cleaned_tweet'].str.contains((event))]
    return len(event_tweet)

In [None]:
event_tweet_count('nbafinal')

In [None]:
tweets_event_ab_test('nbafinal', n = 200)

In [None]:
tweets_event_ab_test('travel ban', n = 100)

In [None]:
def Top_tweets_in_b(event, n = 200):
    A = df[df['cleaned_tweet'].str.contains((event))]
    A.reset_index(inplace = True)
    A_vec = np.array([nlp(i).vector for i in A['cleaned_tweet']])
    A_vec_mean = np.average(A_vec, axis=0)
    
    
    B = df[~df['cleaned_tweet'].str.contains(('event'))]
    B.reset_index(inplace = True)
    
    B_index = list(range(B.shape[0]))
    random.shuffle(B_index)
    n = n
    B1_index = []
    for i in range(n):
        B1_index.append(B_index.pop())
   
        
    B1 = B.iloc[B1_index,:]
    B1.reset_index(inplace = True)
    B1_vec = np.array([nlp(i).vector for i in B1['cleaned_tweet']])
    
    
    consim_twt_list = []
    for i in range(n):  
        consim_twt_list.append((cosine_similarity(B1_vec[i].reshape(1,-1),A_vec_mean.reshape(1,-1))[0][0], B1['cleaned_tweet'][i]))
    pd.options.display.max_colwidth = 200
    result = pd.DataFrame(consim_twt_list, columns = ['score','tweet'])
    print(result.shape)
    result_90 = result[result['score']>.90]
    result_90.sort_values('score', axis = 0, ascending = False)
    return result_90['tweet'].values

In [None]:
Top_tweets_in_b('paris|climate', n = 30000)

In [None]:
event_tweet_count('nationaldonutday')

In [None]:
Top_tweets_in_b('nationaldonutday')

In [None]:
tweets_event_ab_test('nationaldonutday', n = 200)