# Iranian Tweet EDA and Topic Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

### Data Filtering

In [2]:
data = pd.read_csv('data/iranian_tweets.csv')

In [70]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122936 entries, 0 to 1122935
Data columns (total 31 columns):
tweetid                     1122936 non-null int64
userid                      1122936 non-null object
user_display_name           1122936 non-null object
user_screen_name            1122936 non-null object
user_reported_location      887669 non-null object
user_profile_description    995845 non-null object
user_profile_url            434954 non-null object
follower_count              1122936 non-null int64
following_count             1122936 non-null int64
account_creation_date       1122936 non-null object
account_language            1122936 non-null object
tweet_language              1117307 non-null object
tweet_text                  1122936 non-null object
tweet_time                  1122936 non-null object
tweet_client_name           1100078 non-null object
in_reply_to_tweetid         339350 non-null float64
in_reply_to_userid          440244 non-null object
quoted_twe

In [3]:
df = data[['userid','tweet_language','tweet_text','is_retweet','hashtags']]

In [4]:
df = df[(df.is_retweet==False)&(df.tweet_language=='en')]

### Tokenize & Lemmatize, Remove Punctuation

In [5]:
# Tfidf vectorizer takes care of stop words; it's on us to remove links, punctuation (can include in token pattern?)

In [39]:
punc = set(string.punctuation+'‘’…°–—“”')
lem = nltk.stem.WordNetLemmatizer()
twt = TweetTokenizer(reduce_len=True, strip_handles=True)

def tweet_tokenize_full(tweet):
    tokens = twt.tokenize(tweet)
    no_punc = [token for token in tokens if (token[0] not in punc) and (len(token)>2)]
    no_links = [token for token in no_punc if token[0:4]!='http']
    lemmatized = [lem.lemmatize(token) for token in no_links]
    return lemmatized

In [40]:
num_feats = 1000
vctr = TfidfVectorizer(analyzer='word',
                       stop_words='english',
                       tokenizer=tweet_tokenize_full,
                       max_features=num_feats)

In [41]:
X = vctr.fit_transform(df.tweet_text)

In [42]:
tfidf_vals = X.toarray()

In [43]:
feature_names = np.array(vctr.get_feature_names())
feature_names

array(['100', '2012', '2015', '2016', '2017', '2018',
       '3081cc5899cfede8252c2015644ffeccd7b27b1149fb', '360',
       '431d28838bc87b3473081cc5899cfede8252c2015644ffeccd7b27b1149fb',
       '8.5', '9/11', 'abandon', 'abu', 'abuse', 'according', 'accurate',
       'accused', 'act', 'action', 'active', 'activist', 'add', 'added',
       'administration', 'advance', 'aerial', 'affair', 'afghanistan',
       'africa', 'age', 'agency', 'agenda', 'agent', 'aggression', 'ago',
       'agree', 'agreement', 'ahead', 'aid', 'air', 'air-base',
       'air-strikes', 'airport', 'airstrikes', 'al-baghdadi', 'al-nusra',
       'al-quds', 'al-waleed', 'aleppo', 'alert', 'ali', 'allah',
       'alliance', 'ally', 'alter', 'amazing', 'ambassador', 'america',
       'american', 'american-concocted', 'amid', 'analyst', 'anniversary',
       'announced', 'anti', 'applies', 'arab', 'arabia', 'archaeologist',
       'area', 'arm', 'armed', 'army', 'arrest', 'arrested', 'ash', 'ask',
       'asked', 'ass

In [44]:
tfidf_results = pd.DataFrame(data = tfidf_vals, columns=feature_names)

In [51]:
n_components = 4
nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, solver='mu', max_iter=1000, l1_ratio=0.5).fit(X)

In [52]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #{0}: {1}".format(topic_idx, top_words))
        print('\n')
    print()

In [53]:
n_top_words = 20
print_top_words(nmf, feature_names, n_top_words)

Topic #0: ['day', 'attack', 'predict', 'usa', 'terrorist', 'cia', 'pakistan', 'sidney', 'revealed', 'australia', 'pakiistan', 'turkey', 'inquiry', 'confirms', 'creation', 'congress', 'formed', '2012', 'isi', 'putin']


Topic #1: ['state', 'united', 'racist', 'society', 'structurally', 'nation', "ferguson's", 'set', 'inferno', 'prouve', 'infiltrated', 'doe', 'photo', 'ha', 'suffering', 'citizen', 'abuse', 'power', 'police', 'bieber']


Topic #2: ['isi', 'success', 'sydney', 'siege', 'sex', 'american', 'open', 'consulate', 'istanbul', 'recruit', 'declarartion', 'young', 'jihad', 'lady', 'fbi', 'turkish', 'decleration', 'girl', 'tell', 'yazidi']


Topic #3: ['saudi', 'arabia', 'israel', 'prince', 'trump', 'new', 'donald', 'talal', 'visit', 'age', 'peace', '9/11', 'victim', 'shall', 'mark', 'hand', 'iran', 'blood', 'syria', '3081cc5899cfede8252c2015644ffeccd7b27b1149fb']



