In [1]:
import pandas as pd
import numpy as np

# from matplotlib import pyplot as plt
import matplotlib

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import silhouette_score

import warnings
import math
warnings.filterwarnings('ignore')


In [2]:
# Tweets export. sep=None asks pandas to sniff the delimiter, which only the
# Python parsing engine supports, so that engine is requested explicitly.
# on_bad_lines="skip" silently drops malformed rows; it replaces the
# error_bad_lines=False / warn_bad_lines=False pair, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
tweets = pd.read_csv("output_got.csv", sep=None, engine="python", on_bad_lines="skip")

# Rankings table: index the rows by the "Name" column and drop "Count" so that
# only the ranking columns remain. Built as one chained expression (no in-place
# mutation) so the cell can be re-run safely.
state_ranks_df = pd.read_csv("state_rankings.csv").set_index("Name").drop("Count", axis=1)
state_ranks_df.head(5)

Unnamed: 0_level_0,Employment & Earnings,Political Participation,Poverty & Opportunity,Reproductive Rights,Health & Well-Being,Work & Family
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,46,41,45,40,50,39
Alaska,7,33,12,29,27,15
Arizona,34,14,35,24,28,38
Arkansas,47,47,50,43,47,8
California,15,8,23,9,17,2


In [7]:
# Shared text-processing helpers used by tokenize() below.
# English Snowball stemmer, applied to every token.
stemmer = SnowballStemmer(language="english")
# A token is any run of lowercase ASCII letters and apostrophes; everything
# else (digits, punctuation, uppercase letters) acts as a separator.
tokenizer = RegexpTokenizer(pattern="[a-z']+")

# stop_words = text.ENGLISH_STOP_WORDS.union(frozenset(['http','https', 'don', 'gl','www','twitter', 
#                                                       'got','bit','women', 'woman', 'like', 'thank', 'instagram', 'fb', 'ly', 
#                                                       'goo', 'status', 'atus', 'st', 'tatus','repost', 'did', 'sta', 'tus', 'youtu', 
#                                                       'com', 'pic','statu', 'facebook', 'youtube', 'li', 'll', '01', '2017', 
#                                                       'make', 'let', 'need', '31', 'rt', 'ln']))

def tokenize(text):
    """Split ``text`` into word tokens and reduce each token to its stem.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stems of the tokens found by the module-level ``tokenizer``, in
        document order.
    """
    # The token pattern only matches lowercase letters, so mixed-case input
    # would be cut at every capital ("Hello" -> "ello"). Fold case first.
    # Input that is already lowercase is unaffected; scikit-learn vectorizers
    # lowercase documents before calling a custom tokenizer by default, so
    # this matters for direct calls.
    lowered = text.lower()
    return [stemmer.stem(token) for token in tokenizer.tokenize(lowered)]

def get_tf(data, use_idf, max_df=1.0, min_df=1, ngram_range=(1,1),
           max_features=None, stop_words='english'):
    """Fit a bag-of-words vectorizer on ``data`` and return it with its matrix.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize.
    use_idf : bool
        If true, use ``TfidfVectorizer``; otherwise use ``CountVectorizer``.
    max_df, min_df, ngram_range
        Forwarded unchanged to the vectorizer.
    max_features : int or None, default None
        Forwarded to the vectorizer; ``None`` keeps the full vocabulary.
    stop_words : default 'english'
        Forwarded to the vectorizer. The default reproduces the previously
        hard-coded built-in English list.

    Returns
    -------
    (vectorizer, matrix)
        The fitted vectorizer and the result of ``fit_transform(data)``.
    """
    # Both vectorizers take the same keyword arguments, so pick the class
    # once instead of duplicating the constructor call.
    vectorizer_cls = TfidfVectorizer if use_idf else CountVectorizer

    # NOTE(review): tokens are stemmed by `tokenize`, while the built-in
    # 'english' stop list holds unstemmed words, so a stop word whose stem
    # differs from its listed form is presumably not filtered -- confirm
    # whether that is acceptable for this analysis.
    vectorizer = vectorizer_cls(
        max_df=max_df,
        min_df=min_df,
        max_features=max_features,
        stop_words=stop_words,
        ngram_range=ngram_range,
        tokenizer=tokenize,
    )

    matrix = vectorizer.fit_transform(data)
    return vectorizer, matrix

# tf_m, tf_d = get_tf(data['Text'], use_idf=False, max_df=max_df, min_df=3, max_features=1000, stop_words=stop_words)

In [8]:
# Size parameter for the models in this cell: passed to get_kmeans below as
# the number of clusters, and to get_lda in the commented-out call.
n_topics = 4

def get_lda(data, topics):
    """Fit an online-learning LDA model on ``data`` and transform it.

    Parameters
    ----------
    data : array-like or sparse matrix
        Passed unchanged to ``fit`` and ``transform``. Presumably a
        document-term matrix such as the one returned by ``get_tf`` --
        confirm against the caller.
    topics : int
        Number of topics to fit.

    Returns
    -------
    (model, transformed)
        The fitted ``LatentDirichletAllocation`` model and the result of
        ``model.transform(data)``.
    """
    # `n_components` is the supported keyword; the former `n_topics` was
    # deprecated in scikit-learn 0.19 and removed in 0.21, where passing it
    # raises TypeError.
    lda = LatentDirichletAllocation(
        n_components=topics,
        max_iter=20,
        learning_method='online',
        learning_offset=50.,
        random_state=0,  # fixed seed so repeated runs give the same topics
    )
    lda.fit(data)
    return lda, lda.transform(data)

def get_kmeans(data, k, scale=True):
    """Cluster ``data`` into ``k`` groups with KMeans.

    Parameters
    ----------
    data : array-like
        Feature matrix, one row per sample.
    k : int
        Number of clusters.
    scale : bool, default True
        When true, the features are passed through ``MinMaxScaler`` before
        clustering; when false, ``data`` is used as given.

    Returns
    -------
    (model, labels)
        The fitted ``KMeans`` model and the result of ``model.predict`` on
        the same (possibly scaled) features it was fitted on.
    """
    features = MinMaxScaler().fit_transform(data) if scale else data

    # random_state is pinned so the cluster assignment is repeatable.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)
    labels = model.predict(features)
    return model, labels

# lda_m, lda_d = get_lda(tf_d, n_topics)

# Cluster on the ranking columns only. A later cell writes the resulting
# labels back onto state_ranks_df as a 'Cluster' column; dropping that column
# here when it exists (errors="ignore" makes this a no-op on a fresh kernel)
# keeps the cell idempotent, so re-running it never feeds the previous labels
# back into KMeans as a feature.
kmean_m, kmean_d = get_kmeans(
    state_ranks_df.drop("Cluster", axis=1, errors="ignore"),
    n_topics,
    scale=False,
)

In [9]:
def show_topics(model, feature_names, n_words):
    """Print the ``n_words`` highest-weighted features of every topic.

    Parameters
    ----------
    model
        Object exposing ``components_``: one row of per-feature weights for
        each topic.
    feature_names : sequence of str
        Feature name for each column index of ``components_``.
    n_words : int
        How many features to list per topic, heaviest first.

    Prints a ``Topic #i:`` header and a comma-separated feature list for each
    topic, followed by a single blank line after the last topic.
    """
    for number, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_words.
        heaviest = weights.argsort()[::-1][:n_words]
        print("Topic #%d:" % number)
        print(", ".join(feature_names[idx] for idx in heaviest))
    print()
    
def show_cluster_topics(cluster_labels, tf_matrix, feature_names, n_words):
    """Print the ``n_words`` features with the largest summed weight per cluster.

    Parameters
    ----------
    cluster_labels : sequence
        Cluster label of each row of ``tf_matrix``.
    tf_matrix
        Matrix exposing ``toarray()``; one row per document, one column per
        feature.
    feature_names : sequence of str
        Feature name for each column index of ``tf_matrix``.
    n_words : int
        How many features to list per cluster, largest total first.

    Prints a ``Cluster #i:`` header and a comma-separated feature list for
    each cluster, followed by a single blank line after the last cluster.
    """
    # Dense copy of the matrix with the label attached as an extra column 'c'.
    term_weights = pd.DataFrame(tf_matrix.toarray())
    term_weights['c'] = cluster_labels

    # Sum the rows of each cluster, then transpose so that every column holds
    # one cluster's per-feature totals (indexed by feature position).
    totals_by_cluster = term_weights.groupby('c').sum().T

    for cluster_id, totals in totals_by_cluster.items():
        best_positions = totals.nlargest(n_words).index.tolist()
        print("Cluster #%d:" % cluster_id)
        print(", ".join(feature_names[pos] for pos in best_positions))
    print()
    
# print("Top 15 stemmed words per topic in LDA model\n")
# show_topics(lda_m, tf_m.get_feature_names(), 15)

# print("Top 15 stemmed words per cluster in Kmeans model\n")
# show_cluster_topics(kmean_d, tfidf_d, tfidf_m.get_feature_names(), 15)

In [10]:
# Store the fitted KMeans label of every row on the rankings table (converted
# to a plain Python list first), then preview the first five rows.
state_ranks_df["Cluster"] = kmean_m.labels_.tolist()
state_ranks_df.head()

Unnamed: 0_level_0,Employment & Earnings,Political Participation,Poverty & Opportunity,Reproductive Rights,Health & Well-Being,Work & Family,Cluster
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama,46,41,45,40,50,39,3
Alaska,7,33,12,29,27,15,1
Arizona,34,14,35,24,28,38,0
Arkansas,47,47,50,43,47,8,3
California,15,8,23,9,17,2,1
