In [1]:
import pandas as pd
import numpy as np

# from matplotlib import pyplot as plt
import matplotlib

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import silhouette_score

import warnings
import math
warnings.filterwarnings('ignore')


## KMeans Clustering

In [3]:
# Load the per-state rankings, key the rows by the "Name" column, and drop
# "Tweets Per Person" so only the ranking columns remain.
state_ranks_df = (
    pd.read_csv("state_rankings.csv")
    .set_index("Name")
    .drop(["Tweets Per Person"], axis=1)
)
state_ranks_df.head(5)

Unnamed: 0_level_0,Employment & Earnings,Political Participation,Poverty & Opportunity,Reproductive Rights,Health & Well-Being,Work & Family
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,46,41,45,40,50,39
Alaska,7,33,12,29,27,15
Arizona,34,14,35,24,28,38
Arkansas,47,47,50,43,47,8
California,15,8,23,9,17,2


In [4]:
# Number of clusters requested from KMeans in this cell (it is passed as `k`
# to get_kmeans). Despite the name it is not a topic count here; the same
# name is rebound to the LDA topic count further down the notebook.
n_topics = 4

def get_kmeans(data, k, scale=True):
    """Fit a KMeans model and return it together with the cluster assignments.

    Parameters
    ----------
    data : array-like or DataFrame
        Samples to cluster, one row per sample.
    k : int
        Number of clusters.
    scale : bool, default True
        When true, every feature is rescaled with MinMaxScaler before fitting.

    Returns
    -------
    (model, assignments)
        The fitted KMeans estimator (seeded with random_state=0) and the
        cluster index predicted for every row of the (optionally scaled) data.
    """
    features = MinMaxScaler().fit_transform(data) if scale else data

    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)
    assignments = model.predict(features)
    return model, assignments

# Cluster the raw (unscaled) state rankings into `n_topics` groups.
kmean_m, kmean_d = get_kmeans(data=state_ranks_df, k=n_topics, scale=False)

In [7]:
# Attach each row's KMeans label as a new 'Cluster' column. labels_ follows
# the row order of the frame that was passed to fit, so it lines up with the
# state index.
# NOTE(review): this mutates state_ranks_df in place. Re-running the
# clustering cell after this one would feed the 'Cluster' column back into
# KMeans as a feature; restart and run top-to-bottom to reproduce.
state_ranks_df['Cluster'] = kmean_m.labels_.tolist()
state_ranks_df.head(5)


Unnamed: 0_level_0,Employment & Earnings,Political Participation,Poverty & Opportunity,Reproductive Rights,Health & Well-Being,Work & Family,Cluster
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama,46,41,45,40,50,39,2
Alaska,7,33,12,29,27,15,0
Arizona,34,14,35,24,28,38,3
Arkansas,47,47,50,43,47,8,2
California,15,8,23,9,17,2,0


## LDA

In [8]:
# Load the scraped tweets. sep=None asks pandas to sniff the delimiter, which
# only the Python parsing engine can do, so that engine is requested
# explicitly. Malformed rows are skipped silently.
try:
    # pandas >= 1.3 spells "skip bad rows without warning" as on_bad_lines.
    tweets = pd.read_csv("output_got.csv", sep=None, engine="python", on_bad_lines="skip")
except TypeError:
    # Older pandas rejects the on_bad_lines keyword; use the legacy flags,
    # which were removed again in pandas 2.0.
    tweets = pd.read_csv("output_got.csv", sep=None, engine="python",
                         error_bad_lines=False, warn_bad_lines=False)

# Plain Python list of tweet bodies, in file order.
texts = tweets['Text'].tolist()

# Tokens to ignore when vectorizing: scikit-learn's built-in English stop
# words plus corpus-specific noise (URL fragments, platform names, the search
# terms themselves and a few filler words).
stop_words = text.ENGLISH_STOP_WORDS.union({
    'http', 'https', 'don', 'gl', 'www', 'twitter',
    'got', 'bit', 'women', 'woman', 'like', 'thank', 'instagram', 'fb', 'ly',
    'goo', 'status', 'atus', 'st', 'tatus', 'repost', 'did', 'sta', 'tus', 'youtu',
    'com', 'pic', 'statu', 'facebook', 'youtube', 'li', 'll', '01', '2017',
    'make', 'let', 'need', '31', 'rt', 'ln', 'html',
})


In [9]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Parameters
    ----------
    model : object with a ``components_`` array
        One row of per-feature weights per topic.
    feature_names : sequence of str
        Vocabulary terms, indexed by feature column.
    no_top_words : int
        How many words to print per topic, heaviest first.

    Each topic produces two lines: a ``Topic <n>:`` header followed by the
    selected words separated by single spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest = weights.argsort()[::-1][:no_top_words]
        words = [feature_names[col] for col in heaviest]
        print("Topic %d:" % (topic_idx))
        print(" ".join(words))
    
def LDA(documents, max_df=0.95, min_df=2, max_features=1000, n_topics=20, n_top_words=10):
    '''
    Fit an LDA topic model on raw term counts and print each topic's top words.

    The documents are vectorized with a CountVectorizer that:
      - drops the module-level ``stop_words``;
      - ignores terms appearing in more than ``max_df`` of the documents
        (0.95 = 95% by default);
      - ignores terms appearing in fewer than ``min_df`` documents
        (2 by default, i.e. terms seen in only one document);
      - keeps only the ``max_features`` most frequent remaining terms.
    The resulting matrix holds plain counts: no tf-idf weighting and no L2
    normalization is applied.

    Parameters
    ----------
    documents : iterable of str
        Raw texts to model.
    max_df, min_df, max_features
        Vocabulary filters forwarded to CountVectorizer (see above).
    n_topics : int
        Number of LDA topics to fit.
    n_top_words : int
        Number of words printed per topic.

    Returns
    -------
    None
        The topics are printed via ``display_topics``.
    '''
    # Current scikit-learn only accepts a list (or 'english'/None) for
    # stop_words, so the frozenset is converted here.
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features,
                                    stop_words=list(stop_words))
    tf = tf_vectorizer.fit_transform(documents)

    # vocabulary_ maps term -> column index; ordering the terms by that index
    # gives the feature names in column order on every scikit-learn version
    # (get_feature_names() was removed in newer releases).
    vocabulary = tf_vectorizer.vocabulary_
    tf_feature_names = sorted(vocabulary, key=vocabulary.get)

    # n_components is the supported keyword for the topic count; the older
    # n_topics keyword was removed from LatentDirichletAllocation.
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=20, learning_method='online',
                                    learning_offset=50., random_state=0).fit(tf)

    display_topics(lda, tf_feature_names, n_top_words)
    
# Fit LDA on the raw tweet texts with the default settings and print the topics.
LDA(documents=texts)

Topic 0:
time history senators 100 postcard heard send voices source twi
Topic 1:
stat day movement inauguration social start dems c0nvey theresistance supporters
Topic 2:
march womens signs post female join human important blog solidarity
Topic 3:
resist sciencemarch la ow yes action marching en stopsessions check
Topic 4:
just want feminist weekend hope amazing washingtonpost fighting los justice
Topic 5:
muslimban nobannowall theresistance resist blacklivesmatter muslimbanprotest imstillwithher heretostay strongertogether trump
Topic 6:
protests nomuslimban good mmflint pink lsarsour pussy country etsy work
Topic 7:
indivisible html great didn blm liberals sallyyates anti democrats nytimes
Topic 8:
sign support org womensrights ppl muslim moveon inauguration sorry petitions
Topic 9:
read going latest paper 11e6 edition_id days dumptrump 10 soros
Topic 10:
resistance womensmarchonwashington girl know resist broadway hey looks does doesn
Topic 11:
notmypresident maga whyimarch love ma

In [7]:
# Shared text-processing helpers used by tokenize():
#   stemmer   - English Snowball stemmer
#   tokenizer - keeps runs of lowercase letters and apostrophes as tokens
stemmer = SnowballStemmer(language="english")
tokenizer = RegexpTokenizer(pattern="[a-z']+")

def tokenize(text):
    """Split ``text`` with the module-level regexp tokenizer and stem each token.

    Returns a list with one stemmed string per token, in order of appearance.
    """
    return list(map(stemmer.stem, tokenizer.tokenize(text)))

def get_tf(data, use_idf, max_df=1.0, min_df=1, ngram_range=(1,1), max_features=100):
    """Vectorize documents with the stemming tokenizer.

    Parameters
    ----------
    data : iterable of str
        Raw documents.
    use_idf : bool
        True builds a TfidfVectorizer, False a CountVectorizer (raw counts).
    max_df, min_df : float or int
        Document-frequency bounds forwarded to the vectorizer.
    ngram_range : tuple of (int, int)
        Smallest and largest n-gram size to extract.
    max_features : int
        Upper bound on the vocabulary size. It is applied to both vectorizer
        types; previously the tf-idf branch silently ignored it.

    Returns
    -------
    (vectorizer, matrix)
        The fitted vectorizer and the document-term matrix for ``data``.
    """
    vectorizer_cls = TfidfVectorizer if use_idf else CountVectorizer

    # Both vectorizers share the same settings. stop_words is turned into a
    # list because current scikit-learn rejects a frozenset for that argument.
    # NOTE(review): stop words appear to be matched against the already-stemmed
    # tokens from tokenize(), so unstemmed entries such as 'youtube' do not
    # filter the stem 'youtub' - confirm and consider stemming the stop list.
    m = vectorizer_cls(max_df=max_df, min_df=min_df, max_features=max_features,
                       stop_words=list(stop_words), ngram_range=ngram_range,
                       tokenizer=tokenize)

    d = m.fit_transform(data)
    return m, d

# Raw term counts over the tweet texts, using the stemming tokenizer.
tf_m, tf_d = get_tf(
    data=tweets['Text'],
    use_idf=False,
    max_df=0.95,
    min_df=2,
    max_features=1000,
)

In [8]:
# Number of LDA topics to fit. This rebinds the `n_topics` name that the
# KMeans section earlier set to 4 as its cluster count.
n_topics = 20

def get_lda(data, n_topics):
    """Fit an online LDA model and return it with the document-topic matrix.

    Parameters
    ----------
    data : document-term matrix
        Term counts, one row per document (e.g. the output of get_tf).
    n_topics : int
        Number of topics to fit.

    Returns
    -------
    (model, doc_topics)
        The fitted LatentDirichletAllocation estimator and the result of
        ``model.transform(data)``.
    """
    # n_components is the supported keyword for the topic count; the older
    # n_topics keyword was removed from LatentDirichletAllocation.
    m = LatentDirichletAllocation(n_components=n_topics, max_iter=20, learning_method='online',
                                  learning_offset=50., random_state=0).fit(data)
    d = m.transform(data)
    return m, d

# Fit LDA on the stemmed term counts; lda_d holds the per-document topic mix.
lda_m, lda_d = get_lda(data=tf_d, n_topics=n_topics)

In [9]:
def show_topics(model, feature_names, n_words):
    """Print the ``n_words`` heaviest terms of each topic, comma-separated.

    Parameters
    ----------
    model : object with a ``components_`` array
        One row of per-feature weights per topic.
    feature_names : sequence of str
        Vocabulary terms, indexed by feature column.
    n_words : int
        Number of terms listed per topic, heaviest first.

    Every topic yields a ``Topic #<n>:`` header line and a line of terms; one
    blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest = weights.argsort()[::-1][:n_words]
        terms = [feature_names[col] for col in heaviest]
        print("Topic #%d:" % topic_idx)
        print(", ".join(terms))
    print()
    
print("Top 15 stemmed words per topic in LDA model\n")
# vocabulary_ maps each term to its column index, so sorting the terms by that
# index yields the feature names in column order. This avoids
# get_feature_names(), which newer scikit-learn releases removed.
show_topics(lda_m, sorted(tf_m.vocabulary_, key=tf_m.vocabulary_.get), 15)

Top 15 stemmed words per topic in LDA model

Topic #0:
time, postcard, senat, voic, histori, s, heard, send, sourc, twi, tter, talk, check, anoth, perfect
Topic #1:
march, washington, world, sharia, k, movement, read, g, life, law, pro, buff, islam, z, uk
Topic #2:
w, vote, resist, weekend, help, impeachtrump, democrat, indivis, unit, hey, huffingtonpost, lead, trump, senschum, report
Topic #3:
know, hat, thing, use, pussi, you'r, tweet, pink, didn't, pussyhat, wear, soro, etsi, order, vagina
Topic #4:
just, watch, v, m, youtub, r, t, fight, trump, want, equal, man, video, million, start
Topic #5:
broemmel, love, sr, ref, resist, amazon, bonfil, utf, resisttrump, qid, keyword, mike, stori, true, men
Topic #6:
protest, sign, stat, nomuslimban, mani, inaugur, x, day, org, work, proud, ppl, petit, mmflint, moveon
Topic #7:
resist, nobannowal, muslimban, theresist, notmypresid, blacklivesmatt, don't, strongertogeth, s, imstillwithh, heretostay, aclu, alternativefact, liber, great
Topic #8: