In [None]:
import pandas as pd  
# Load the saved tweets. The CSV is read without a header row, so the four
# column names are assigned explicitly afterwards.
DF = pd.read_csv("saved_tweets.csv", header=None)
DF.columns = ["content", "hashtags", "screen_name", "location"]

# Sanity check: show the text of the tweet stored at row position 774.
print(DF["content"].iloc[774])

In [None]:
from collections import Counter  
import ast

# Extract hashtags and put them in a list.
# Each entry of DF.hashtags is presumably the string form of a Python list
# (e.g. "['a', 'b']") -- TODO confirm against the scraper that wrote the CSV.
list_hashtag_strings = list(DF.hashtags)

# Parse every row on its own. Joining all rows into one big literal and
# evaluating it once breaks when there is exactly one row (the result is a
# flat list of strings, so the flattening below would iterate over
# characters) and when there are no rows at all (empty literal).
list_hashtag_lists = [ast.literal_eval(entry) for entry in list_hashtag_strings]

# Flatten and lower-case so that "#Foo" and "#foo" are counted together.
hashtag_list = [ht.lower() for list_ in list_hashtag_lists for ht in list_]

# Count most common hashtags
counter_hashtags = Counter(hashtag_list)
counter_hashtags.most_common(20)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# help(WordCloud)

def tag_cloud(tokens, stop_set = None):
    """
    Draw a word cloud for a sequence of tokens.

    - tokens: iterable of strings; they are joined with single spaces and
      handed to WordCloud as one text
    - stop_set: forwarded unchanged as WordCloud's `stopwords` argument
    The figure is shown and then closed; nothing is returned.
    """
    text = ' '.join(tokens)
    cloud = WordCloud(stopwords = stop_set).generate(text)

    plt.figure(figsize=(12,12),dpi=200)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    # To keep a copy on disk: plt.savefig('wordcloud.png',bbox_inches='tight')
    plt.show()
    plt.close()

# Collect every whitespace-separated, lower-cased token of every tweet
# into one flat list and render it as a word cloud.
tokens = []
for s in DF["content"]:
    tokens.extend(token.lower() for token in s.split())

tag_cloud(tokens)

### affective computing: dictionary-based sentiment analysis

In [None]:
# Token -> score dictionaries, keyed by the boolean `center` flag, so the
# labMT file is read and parsed once instead of once per sent_vect() call.
_LABMT_CACHE = {}


def _labmt_scores(center):
    """Return the labMT token -> happiness dict (mean-centered if `center`), loading it on first use."""
    key = bool(center)
    if key not in _LABMT_CACHE:
        labmt = pd.read_csv("labmt_dict.csv",
                            sep="\t", encoding="utf-8", index_col=0)
        scores = labmt.happiness_average
        if key:
            scores = scores - scores.mean()
        _LABMT_CACHE[key] = scores.to_dict()
    return _LABMT_CACHE[key]


def sent_vect(tokens,center=False):
    """
    Lab MT sentiment score for each token in the list tokens
    - tokens: iterable of strings, looked up verbatim in the first column
      of labmt_dict.csv
    - center: if True, the mean of happiness_average is subtracted so the
      scores are zero-centered; if False (default) raw scores are returned
    - tokens missing from the dictionary score 0.0
    Returns a list of scores, one per token, in the same order.
    """
    sent_dict = _labmt_scores(center)
    return [sent_dict.get(token,0.0) for token in tokens]

content = DF["content"].tolist()

# Average labMT score per tweet: sum of token scores divided by the number
# of whitespace-separated tokens.
sentiment_array = []
for s in content:
    # A dedicated name keeps this loop from overwriting the global `tokens`
    # list that was built for the word cloud.
    tweet_tokens = [token.lower() for token in s.split()]
    if tweet_tokens:
        score = sum(sent_vect(tweet_tokens))/len(tweet_tokens)
    else:
        # Empty or whitespace-only tweet: there is nothing to average, and
        # dividing by len() would raise ZeroDivisionError. Use 0.0, the same
        # value sent_vect assigns to unknown tokens.
        score = 0.0
    sentiment_array.append(score)

# Attach the scores to the frame and persist the annotated copy.
DF["sentiment"] = sentiment_array
DF.to_csv("saved_tweets_annotated.csv",index=False)

DF.head(10)

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

def tokenize(s,n=0):
    """
    Split s on runs of non-word characters and keep only the pieces that
    are longer than n characters (n=0 drops the empty strings that the
    split leaves at the edges).
    """
    pieces = re.split(r"\W+", s)
    return list(filter(lambda piece: len(piece) > n, pieces))

def display_topics(model, feature_names, no_top_words):
    """
    Print, for every row of model.components_, a "Topic i:" header followed
    by the no_top_words feature names with the largest weights in that row,
    highest weight first, separated by spaces.
    """
    for idx, weights in enumerate(model.components_):
        print("Topic {}:".format(idx))
        # argsort is ascending: reverse it, then keep the leading entries
        strongest = weights.argsort()[::-1][:no_top_words]
        words = [feature_names[j] for j in strongest]
        print(" ".join(words))
# Topic model on the tweet texts: TF-IDF vector space + NMF factorisation.
content = DF["content"].tolist()

# vector space
no_features = 1000
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(content)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# model
no_topics = 5
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
# alpha_W / alpha_H, which scale the penalty differently. It is left as is
# here so results do not change silently -- revisit when upgrading.
mdl = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')

# Fit exactly once. The original fitted the same model three times (fit,
# then fit_transform twice); with a fixed random_state and 'nndsvd' init
# every run produced the same factors, so the repeats only cost time.
# document representation
W = mdl.fit_transform(tfidf)
# dictionary
H = mdl.components_
# `nmf` is kept as an alias of the fitted model (fit() returned the model itself).
nmf = mdl

# inspect
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(W)

print(pca.explained_variance_ratio_)  
print(pca.singular_values_)  


W2d = pca.transform(W)

color_list = list()
threshold = sum(sentiment_array)/len(sentiment_array)
for val in sentiment_array:
    if val < threshold:
        color_list.append("b")
    elif val > threshold:
        color_list.append("r")
    else:
        color_list.append("y")

%matplotlib inline

plt.scatter(W2d[:,0], W2d[:,1],c = color_list);
plt.axis([-.04,.075,-.075,.075])

In [None]:
import glob, os
from natsort import natsorted

# Topic model on the first three "russian-troll" CSV files (natural sort order).
fnames = natsorted(glob.glob(os.path.join("russian-troll","*.csv")))
if not fnames:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("no CSV files found in the 'russian-troll' directory")

# Read into a separately named list so `df` is only ever a DataFrame, and
# renumber the rows so index labels are unique across the concatenated files.
frames = [pd.read_csv(fname,low_memory=False) for fname in fnames[:3]]
df = pd.concat(frames,axis=0,ignore_index=True)

# TfidfVectorizer rejects NaN documents; replace missing texts with an empty
# string so `content` stays aligned row-for-row with `df`.
content = df["content"].fillna("").astype(str).tolist()

# vector space
no_features = 1000
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(content)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# model
no_topics = 25
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
# alpha_W / alpha_H, which scale the penalty differently. It is left as is
# here so results do not change silently -- revisit when upgrading.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# inspect
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)