In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [8]:
data = pd.read_csv('data/iranian_tweets.csv')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122936 entries, 0 to 1122935
Data columns (total 31 columns):
tweetid                     1122936 non-null int64
userid                      1122936 non-null object
user_display_name           1122936 non-null object
user_screen_name            1122936 non-null object
user_reported_location      887669 non-null object
user_profile_description    995845 non-null object
user_profile_url            434954 non-null object
follower_count              1122936 non-null int64
following_count             1122936 non-null int64
account_creation_date       1122936 non-null object
account_language            1122936 non-null object
tweet_language              1117307 non-null object
tweet_text                  1122936 non-null object
tweet_time                  1122936 non-null object
tweet_client_name           1100078 non-null object
in_reply_to_tweetid         339350 non-null float64
in_reply_to_userid          440244 non-null object
quoted_twe

In [10]:
df = data[['userid','tweet_language','tweet_text','is_retweet','hashtags']]

In [11]:
df = df[(df.is_retweet==False)&(df.tweet_language=='en')]

In [12]:
df['tokens'] = df.tweet_text.apply(lambda x: nltk.word_tokenize(x))

In [13]:
df.tokens.values[0:3]

array([list(['@', 'ParkerLampe', 'An', 'inquiry', 'by', 'congress', 'confirms', 'that', 'ISIS', 'is', 'indeed', 'a', 'CIA', 'creation', 'http', ':', '//t.co/eFRmFwYZTV']),
       list(['@', 'hadeelhmaidi', '@', 'wordpressdotcom', 'CIA', 'predict', 'third', 'terrorist', 'attack', 'after', 'Sidney', 'and', 'Pakiistan', 'in', 'USA', 'in', '3', 'days', 'http', ':', '//t.co/IrPx7M223N']),
       list(['@', 'irfhabib', 'why', 'boko', 'haram', 'come', 'europe', ',', 'legally', 'and', 'easily', '?', 'http', ':', '//t.co/on2vzPqEPH'])],
      dtype=object)

In [14]:
def tweet_tokenize(tweet):
    twt = TweetTokenizer(reduce_len=True, strip_handles=True)
    return twt.tokenize(tweet)

In [15]:
df['tweet_tokens'] = df.tweet_text.apply(lambda x: tweet_tokenize(x))

In [16]:
df.tweet_tokens.values[5:10]

array([list(['What', 'would', 'happen', 'to', 'you', 'if', 'you', 'were', 'a', 'political', 'dissent', 'in', 'Saudi', 'Arabia', '?', 'http://t.co/n1pHsz1UsX']),
       list(['Saudi', 'embassy', 'in', 'Turkey', 'became', 'a', 'safe', 'haven', 'for', 'ISIL', 'terrorists', 'http://t.co/BsMnfC1xqU']),
       list(['We', 'can', 'destroy', 'Israel', 'in', '‘', 'less', 'than', '12', 'minutes', '’', ':', 'Pakistani', 'commander', 'https://t.co/CSkZ4Z6ke1']),
       list(['what', 'they', 'will', 'never', 'tell', 'you', 'about', 'Christmas', 'http://t.co/ovd4gGlWLE']),
       list(['is', 'there', 'a', 'secret', 'collaboration', 'between', 'U', '.', 'S', 'and', 'Iran', 'against', 'isis', '?', 'http://t.co/Fw1IH3qOCb'])],
      dtype=object)

In [17]:
def strip_stops(tweet):
    stops = set(nltk.corpus.stopwords.words('english'))
    return [i.lower() for i in tweet if i.lower() not in stops]

In [18]:
# Remove stop words
df['topic_words'] = df.tweet_tokens.apply(strip_stops)

In [19]:
# Remove links
df['topic_words'] = df['topic_words'].apply(lambda x: [i for i in x if i[0:4]!='http'])

In [37]:
# Remove stray punctuation
import string
punc = set(string.punctuation+'‘’…°–—“”')
df['topic_words'] = df['topic_words'].apply(lambda x: [i for i in x if i[0] not in punc])

In [38]:
df.topic_words.values[0:5]

array([list(['inquiry', 'congress', 'confirms', 'isis', 'indeed', 'cia', 'creation']),
       list(['cia', 'predict', 'third', 'terrorist', 'attack', 'sidney', 'pakiistan', 'usa', '3', 'days']),
       list(['boko', 'haram', 'come', 'europe', 'legally', 'easily']),
       list(['isis', 'militants', 'plan', 'target', 'western', 'capitals']),
       list(['turkish', 'intelligence', 'chief', 'isis', 'reality', 'optimistic', 'future'])],
      dtype=object)

In [39]:
def lemmatizer(tokens):
    lem = nltk.stem.WordNetLemmatizer()
    return [lem.lemmatize(i) for i in tokens]

In [40]:
df['lemma'] = df.topic_words.apply(lemmatizer)

In [41]:
df.lemma.values[0:10]

array([list(['inquiry', 'congress', 'confirms', 'isi', 'indeed', 'cia', 'creation']),
       list(['cia', 'predict', 'third', 'terrorist', 'attack', 'sidney', 'pakiistan', 'usa', '3', 'day']),
       list(['boko', 'haram', 'come', 'europe', 'legally', 'easily']),
       list(['isi', 'militant', 'plan', 'target', 'western', 'capital']),
       list(['turkish', 'intelligence', 'chief', 'isi', 'reality', 'optimistic', 'future']),
       list(['would', 'happen', 'political', 'dissent', 'saudi', 'arabia']),
       list(['saudi', 'embassy', 'turkey', 'became', 'safe', 'isil', 'terrorist']),
       list(['destroy', 'israel', 'le', '12', 'minute', 'pakistani', 'commander']),
       list(['never', 'tell', 'christmas']),
       list(['secret', 'collaboration', 'u', 'iran', 'isi'])],
      dtype=object)

In [42]:
def dum_preprocess(s):
    return s
def dum_tokenizer(s):
    return s

In [61]:
num_feats = 1000
vctr = TfidfVectorizer(analyzer='word',tokenizer=dum_tokenizer,preprocessor=dum_preprocess,max_features=num_feats,token_pattern=None)
X = vctr.fit_transform(df.lemma)

In [62]:
tfidf_vals = X.toarray()

MemoryError: 

In [None]:
feature_names = np.array(vctr.get_feature_names())
feature_names

In [None]:
tfidf_results = pd.DataFrame(data = tfidf_vals, columns=feature_names)
tfidf_results.head()

In [None]:
n_components = 5
nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, solver='mu', max_iter=1000, l1_ratio=0.5).fit(X)
nmf

In [None]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #{0}: {1}".format(topic_idx, top_words))
    print()

In [None]:
n_top_words = 10
print_top_words(nmf, feature_names, n_top_words)