In [48]:
min(troll.date), max(troll.date)

(Timestamp('2009-11-24 05:21:00'), Timestamp('2018-05-30 20:58:00'))

In [50]:
min(legit.date), max(legit.date)

(Timestamp('2016-07-28 11:58:12+0000', tz='UTC'),
 Timestamp('2016-10-02 17:21:17+0000', tz='UTC'))

In [56]:
troll_window = troll[(troll['date'] >= '2016-01-01') & (troll['date'] <= '2016-11-8')]

In [57]:
len(troll_window)

827548

In [58]:
len(troll)

2774590

In [59]:
len(troll) - len(troll_window)

1947042

In [91]:
min(second_debate.date), max(second_debate.date)

(Timestamp('2016-10-07 01:37:45+0000', tz='UTC'),
 Timestamp('2016-10-10 17:00:52+0000', tz='UTC'))

### NLP Pipeline:

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords, words

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Capstone 3 Notes

'''
Use = clean_data/ ...

	- legit_tweets
	- troll_tweets
	- second_debate
	- third_debate
'''

# columns = 'date', 'user', 'text', 'location', 'retweets', 'favs', 'is_retweet'

'''
column schema:
'ocation' = location of tweeter, NA = missing in original data
'rdate' = date/time tweet was made
'user' = twitter account holder
'text' = text of tweet
'letweets' = number of retweets, -1 = missing in original data
'favs' = number of favorites, -1 = missing in original data
'is_retweet' = 0, 1 indicator if tweet is retweet (1 is positve class, -1 = missing in original data)
'legit' = 0 indicates troll, 1 indicates legitimate

troll = 2,774,590
legit = 4,992,304
total = 7,766,894


Legit time = 
	Min = ('2016-07-28 11:58:12+0000'),
	Max = ('2016-10-02 17:21:17+0000')

Troll time = 
	Min = ('2009-11-24 05:21:00'),
	Max = ('2018-05-30 20:58:00')
'''

"\ncolumn schema:\n'ocation' = location of tweeter, NA = missing in original data\n'rdate' = date/time tweet was made\n'user' = twitter account holder\n'text' = text of tweet\n'letweets' = number of retweets, -1 = missing in original data\n'favs' = number of favorites, -1 = missing in original data\n'is_retweet' = 0, 1 indicator if tweet is retweet (1 is positve class, -1 = missing in original data)\n'legit' = 0 indicates troll, 1 indicates legitimate\n\ntroll = 2,774,590\nlegit = 4,992,304\ntotal = 7,766,894\n\n\nLegit time = \n\tMin = ('2016-07-28 11:58:12+0000'),\n\tMax = ('2016-10-02 17:21:17+0000')\n\nTroll time = \n\tMin = ('2009-11-24 05:21:00'),\n\tMax = ('2018-05-30 20:58:00')\n"

In [3]:
# clean tweets hydrated from GWU
def clean_hydrated(filepath):  
    df = pd.read_csv(filepath)
    df = df[df['lang'] == 'en']
    df = df.loc[:, ['created_at', 'user_screen_name', 'text', 'user_location', 'retweet_count', 'favorite_count']]
    df.rename(columns={'created_at': 'date', 'user_screen_name': 'user', 'text': 'text', 'user_location': 'location', 
                       'retweet_count': 'retweets', 'favorite_count': 'favs'}, inplace=True)
    df.drop_duplicates(ignore_index=True, inplace=True)
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
    df['legit'] = 1
    
    return df

# generating stop words
custom_stops = []

def get_stop_words(custom_stops):
    sw = stopwords.words('english')
    
    return sw + stops

# sklearn count vectorizer
def get_countvec(corpus, stop_words='english', min_df=.01, n_grams=(1,1)):
    vectorizer = CountVectorizer(stop_words=stop_words, min_df=min_df, ngram_range=n_grams)
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names()
    
    return feature_names, X.toarray()

# sklearn tfidf vectorizer
def get_tfidf(corpus, max_features=None, min_df=.01, stop_words='english', n_grams=(1,1)):
    vectorizer = TfidfVectorizer(max_features=None, min_df=min_df, max_df=1.0, stop_words='english', ngram_range=n_grams)
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names()
    
    return feature_names, X.toarray()

# vectorizer to dataframe
def get_dataframe(X, feature_names):
    df = pd.DataFrame(data = X, columns = feature_names)
    return df

# get random samples
def get_random_sample(df, num_samples):
    samp_idx = np.random.choice(range(len(df)), size=num_samples, replace=False)
    df_samp = df.iloc[samp_idx, :]
    return df_samp

len(troll_summer) = 1206477

In [4]:
# load
legit = pd.read_csv('data/legit_tweets.csv', parse_dates = ['date'])
troll = pd.read_csv('data/troll_tweets.csv', parse_dates = ['date'])
legit['legit'] = 1

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# limit troll timeframe
troll_summer = troll[(troll['date'] > '2016-06-28') & (troll['date'] > '2016-11-02')]

In [6]:
# get samples
troll_samp = get_random_sample(troll_summer, 700000)
legit_samp = get_random_sample(legit, 700000)

In [10]:
# combine legit and troll tweets
total_tweets = pd.concat([legit_samp.loc[:,['text','legit']], troll_samp.loc[:,['text','legit']]])
total_tweets.reset_index(drop=True, inplace=True)

In [11]:
# set X and y, split
# if NaN errror during trainn, test, split, run --> legit['legit'] = 1 
X = total_tweets['text']
y = total_tweets['legit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [17]:
# stop words
sw = stopwords.words('english') # eventually, get_stopwords(), no custom words yet
custom_stops = ['https', 'rt']
stop_words = sw + custom_stops

# count vetorizer
vocab_count, count_vec = get_countvec(X_train, stop_words=stop_words, min_df=0.005, n_grams=(1,2))

In [18]:
count_df = pd.DataFrame(data=count_vec, columns=vocab_count)

In [19]:
count_df.shape

(1050000, 187)

In [20]:
count_df.head()

Unnamed: 0,10,2016,actually,america,american,americans,amp,anti,attack,bad,...,white house,win,woman,women,won,work,world,wrong,year,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
def get_word_freq(df):
    series = df.sum()
    series = series.apply(lambda x: x/len(series))
    return series

In [22]:
count_freq = get_word_freq(count_df)

In [23]:
count_freq.sort_values(ascending=False)

https                5275.663102
trump                2121.951872
rt                   2121.823529
clinton               573.566845
hillary               417.486631
donald                405.385027
donald trump          381.090909
amp                   285.283422
just                  238.374332
realdonaldtrump       223.283422
hillaryclinton        203.449198
hillary clinton       194.871658
election              192.919786
people                178.561497
like                  174.727273
new                   171.288770
obama                 160.502674
president             157.550802
don                   148.652406
says                  126.748663
trump https           125.909091
rt hillaryclinton     123.326203
man                   121.994652
debate                117.689840
vote                  116.433155
video                 113.379679
time                  107.695187
america               105.860963
know                  103.438503
breaking               99.117647
          

In [150]:
sorted_word_count = count_df.sum().sort_values(ascending=False)

In [151]:
sorted_word_count

https       4998560
co          4604796
https co    4589653
trump       3313011
rt          3109453
             ...   
stop          67091
country       66820
russia        66143
rally         65831
last          65817
Length: 89, dtype: int64

In [126]:
len(troll.legit)

2774590

In [None]:
count_df['legit'] = count_df[count_df[]]

In [None]:
legit_count = y

### EDA

In [None]:
# chart word frequencies by class
# chart topics by class 

### LDA

In [26]:
lda = LatentDirichletAllocation(n_components=5, n_jobs=-1, learning_method='online', max_iter=5)

In [27]:
lda.fit(count_df)

LatentDirichletAllocation(learning_method='online', n_components=5, n_jobs=-1)

In [29]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [30]:
num_top_words = 10
display_topics(lda, vocab_count, num_top_words)

Topic 0:
rt realdonaldtrump hillaryclinton don rt hillaryclinton said think want good women
Topic 1:
https rt amp trump election people vote say media white
Topic 2:
https trump rt donald donald trump like new says trump https man
Topic 3:
https rt trump just president america know make did police
Topic 4:
https clinton hillary rt trump hillary clinton obama video time debate


In [39]:
phi = lda.components_

(187,)

In [41]:
theta = lda.transform(count_df)

In [45]:
topic_likelihood = np.argmax(theta, axis=1) # returns column index of most likely topic 

In [46]:
topic_likelihood

array([2, 4, 4, ..., 4, 2, 0])

In [51]:
topic_likelihood.shape

(1050000,)

In [42]:
an_array = np.array([[1, 1, 3], [1, 2, 1]])
print(an_array)

max_index_col = np.argmax(an_array, axis=0) # 0 finds column indices
print(max_index_col)

max_index_row = np.argmax(an_array, axis=1) #1 finds row indices
print(max_index_row)

[[1 1 3]
 [1 2 1]]
[0 1 0]
[2 1]


In [None]:
# frrom sklearn
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()