<div style='text-align: center;'>
    <h1>#LaManada</h1>
    <h2>Mini-Project CSS Summer School</h2>
    <div>July 30th - August 4th, 2018</div>
    <div>Marina Del Rey, C.A.</div>
</div>

<div>
    <b>Members</b>
    <ul>
        <li>Blanca Ramirez, USC (USA)</li>
        <li>Tayrine Dias, UOC (Spain)</li>
        <li>Lisette Espin-Noboa, GESIS (Germany)</li>
    </ul>
</div>

<h2>Descriptive Analysis</h2>

In [None]:
####################################################
# GENERAL DEPENDENCIES
####################################################
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import networkx as nx
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import multidict as multidict
import re
from PIL import Image

In [None]:
####################################################
# NLP DEPENDENCIES
####################################################
import nltk  
from nltk.corpus import stopwords  
from nltk import word_tokenize  
from nltk.data import load  
from nltk.stem import SnowballStemmer  
from string import punctuation  
from sklearn.feature_extraction.text import CountVectorizer

# Stopword list to use: NLTK's Spanish list, a hand-picked list of frequent
# short/Twitter tokens, and a Catalan list read from a local text file.
spanish_stopwords = stopwords.words('spanish')
my_stopwords = ['me', 'las', 'es', 'un', 'mi', 'con', 'ser', 'los', 'si', 'ha', 'hasta', 'o', 'de', 'cuando', 'http', 'su', 'twitter', 'er', 'como', 'to', 'le', 'se', 'en', 'lo', 'a', 'tgo', 'toy', 'tu', 'el', 'por', 'una', 'al', 'para', 'la', 'pero', 'que', 'da', 'https', 'y','q','del','xq','les','mis','te','sí','ya','i','porque','por que','por qué','era','cada','nos','pero','ni']
# NOTE(review): the file is opened with the platform default encoding;
# confirm it decodes accented words correctly on every machine used.
with open('stopwords_ca.txt','r') as f:
    # strip() also removes a trailing '\r' (CRLF files) and surrounding
    # blanks; empty lines are skipped so '' never becomes a stopword.
    catalan_stopwords = [line.strip() for line in f if line.strip()]
# A set gives O(1) membership tests in the token filters below.
all_stopwords = set(my_stopwords)
all_stopwords |= set(spanish_stopwords)
all_stopwords |= set(catalan_stopwords)

# Spanish stemmer (created once; the original built it twice).
stemmer = SnowballStemmer('spanish')

# Characters to remove before tokenising: ASCII punctuation ...
non_words = list(punctuation)
# ... plus Spanish punctuation, typographic quotes and a few symbols ...
non_words.extend(['¿', '¡','"',"'","“","”","‘","’","$","€","<",">","^","`","~","«","»"])
# ... plus the ten decimal digits.
non_words.extend(map(str,range(10)))
def stem_tokens(tokens, stemmer):
    """Return the stems of all tokens that are not stopwords.

    Tokens present in the module-level ``all_stopwords`` set are dropped;
    every remaining token is passed through ``stemmer.stem`` and the stems
    are returned as a list in their original order.
    """
    return [stemmer.stem(token) for token in tokens if token not in all_stopwords]

def tokenize(text):
    """Strip unwanted characters from ``text``, tokenise it and stem the tokens.

    Characters listed in ``non_words`` are removed first, the remainder is
    split with NLTK's ``word_tokenize`` and the tokens are stemmed through
    ``stem_tokens``. If stemming raises, the error and the cleaned text are
    printed and ``['']`` is returned instead.
    """
    # remove punctuation, digits and the other characters in non_words
    cleaned = ''.join(ch for ch in text if ch not in non_words)
    words = word_tokenize(cleaned)

    # stem (best effort: a failure is reported, not propagated)
    try:
        return stem_tokens(words, stemmer)
    except Exception as err:
        print(err)
        print(cleaned)
        return ['']

In [None]:
####################################################
# CONSTANTS
####################################################
# Number of top-ranked users kept for the "most active users" plots.
K = 10
# Field separator passed to pd.read_csv when loading the CSV tables.
SEP = ','

In [None]:
####################################################
# FUNCTIONS (HANDLERS)
####################################################
def getFrequencyDictForText(sentence):
    """Count case-insensitive word frequencies in a space-separated string.

    Parameters
    ----------
    sentence : str
        Text whose words are separated by single spaces.

    Returns
    -------
    dict
        Maps each lowercased word to its number of occurrences; only words
        that occur more than once are kept.
    """
    counts = {}
    for token in sentence.split(" "):
        # Consecutive spaces produce empty tokens; they are not words.
        if not token:
            continue
        # Read and write under the same (lowercased) key. The original looked
        # the count up under the raw token but stored it under the lowercased
        # one, so every differently-cased occurrence reset the running count.
        key = token.lower()
        counts[key] = counts.get(key, 0) + 1
    return {word: n for word, n in counts.items() if n > 1}

def makeImage(text, width=400, height=400, shape=(6, 6)):
    """Render a word cloud from a word -> frequency mapping.

    Parameters
    ----------
    text : dict
        Word frequencies, as returned by ``getFrequencyDictForText``.
    width, height : int, optional
        Pixel size of the generated word-cloud canvas.
    shape : tuple, optional
        Matplotlib figure size in inches.

    The defaults match the values used by the other calls in this notebook,
    so ``makeImage(frequencies)`` works without extra arguments.
    """
    # NOTE(review): per the wordcloud documentation the ``stopwords`` argument
    # is not applied by generate_from_frequencies; filter words beforehand.
    wc = WordCloud(background_color="white", max_words=2000, width=width, height=height, stopwords=all_stopwords)
    wc.generate_from_frequencies(text)
    plt.figure(figsize=shape)
    plt.imshow(wc, interpolation="bilinear")
    # A word cloud has no meaningful axes, so hide the ticks.
    plt.xticks([])
    plt.yticks([])

def _get_text(values):
    return ' '.join([_get_tokens(t) for t in values])
    
def _get_tokens(text):
    """Return ``text`` without URLs, unwanted characters and stopwords.

    Steps: remove http/https URLs, drop every character listed in
    ``non_words``, tokenise with ``word_tokenize`` and discard tokens found
    in ``all_stopwords``. The surviving tokens keep their original casing and
    are returned as one space-separated string.
    """
    # A single pattern removes every http/https URL wherever it occurs. The
    # two line-anchored patterns that used to follow could never match,
    # because no "http" followed by a non-space character is left afterwards.
    text = re.sub(r"http\S+", "", text)
    text = ''.join([c for c in text if c not in non_words])
    # The stopword lists are lowercase, so compare lowercased tokens;
    # otherwise capitalised stopwords ("De", "La", ...) slip through.
    kept = [token for token in word_tokenize(text) if token.lower() not in all_stopwords]
    return ' '.join(kept)
    
def remove_stop_words_es(text):
    """Lowercase ``text``, strip basic punctuation and drop stopwords.

    Returns the remaining words joined by single spaces.

    The previous implementation removed stopwords with substring replacement,
    which glued neighbouring words together ("hola de mundo" -> "holamundo")
    and could delete the tail of longer words (e.g. "de " inside "grande ").
    Filtering whole whitespace-separated words avoids both problems.
    """
    text = text.lower()
    # Punctuation goes first so that "que," or "de." are recognised as stopwords.
    for mark in ('?', '.', ',', ':', ';', '"', "'"):
        text = text.replace(mark, '')
    return ' '.join(word for word in text.split() if word not in all_stopwords)


In [None]:
####################################################
# LOADING THE DATA
####################################################
# Load the user table from its CSV file; the shape is displayed as a quick size check.
users = pd.read_csv('LaManada_new/tbluserinfo.csv',sep=SEP)
users.shape

In [None]:
# Load the posts table (referred to as `tweets` from here on) and display its shape.
tweets = pd.read_csv('LaManada_new/tblposts.csv',sep=SEP)
tweets.shape

In [None]:
# Load the retweets table and display its shape.
retweets = pd.read_csv('LaManada_new/tblretweets.csv',sep=SEP)
retweets.shape

In [None]:
# Load the replies table; fields are quoted with double quotes. Display its shape.
replies = pd.read_csv('LaManada_new/tblreplies.csv',sep=SEP,quotechar='"')
replies.shape

<h3>Columns</h3>

In [None]:
# Column names of the user table.
users.columns

In [None]:
# Column names of the posts table.
tweets.columns

In [None]:
# Column names of the retweets table.
retweets.columns

In [None]:
# Column names of the replies table.
replies.columns

<h3>Who is tweeting about #LaManada?</h3>

In [None]:
# Parse the account-creation timestamps. pd.to_datetime replaces the unit-less
# astype("datetime64") cast, which recent pandas versions reject. utc=True
# followed by tz_localize(None) yields timezone-naive UTC values whether or
# not the raw strings carry an offset, so the naive date strings used in the
# later queries stay comparable. Re-running this cell is harmless.
users["created_at"] = pd.to_datetime(users["created_at"], utc=True).dt.tz_localize(None)

In [None]:
# Show accounts whose creation date lies before 1984 (treated as outliers and removed in the next cell).
users.query("created_at < '1984-01-01 00:00:00'")

In [None]:
# Remove the outlier accounts dated before 1984. The explicit copy makes the
# result an independent frame, so the column added to `users` further down
# is written to it directly instead of to a slice of the unfiltered table.
users = users.query("created_at > '1984-01-01 00:00:00'").copy()

<h3>Accounts creation date</h3>
There is a peak of ~4000 users that were created in 2018 (probably trolls?)

In [None]:
# Coarse view: 12 equal-width bins over the observed creation dates
# (labelled "per year" in the title).
ax = users.created_at.hist(bins=12)
ax.set(xlabel='Date', ylabel='# Users')
ax.set_title("Users' account creation per year")

In [None]:
# Medium view: 144 equal-width bins over the observed creation dates
# (labelled "per month" in the title).
ax = users.created_at.hist(bins=144)
ax.set(xlabel='Date', ylabel='# Users')
ax.set_title("Users' account creation per month")

In [None]:
# Fine view: 4380 equal-width bins over the observed creation dates
# (labelled "per day" in the title).
ax = users.created_at.hist(bins=4380)
ax.set(xlabel='Date', ylabel='# Users')
ax.set_title("Users' account creation per day")

In [None]:
# 27-April - 4 May
# Number of accounts created on each calendar day, drawn as a scatter plot.
users.loc[:,'creation_date'] = users.created_at.dt.date
tmp = users.groupby('creation_date').size().reset_index(name='users')
ax = sns.scatterplot(x='creation_date', y='users', data=tmp)

<h3>Users' # Favorites (in general)</h3>

In [None]:
# Distribution of favourites per user on log-log axes.
ax = users.favourites_count.hist(bins=500)
ax.set(xscale='log', yscale='log', xlabel='Favorite Counts', ylabel='# Users')
ax.set_title("Users' Favorite counts")

<h3>Users' listed count</h3>
Around 1M users have been listed in $\leq$ 100 lists. <br />
Only a few users (~10) have been listed in at least 2K lists.

In [None]:
# Distribution of the number of lists each user appears in, on log-log axes.
ax = users.listed_count.hist(bins=500)
ax.set(xscale='log', yscale='log', xlabel='Lists Counts', ylabel='# Users')
ax.set_title("Users' Lists counts")

<h3>#Friends vs #Followers</h3>
Verified accounts (right columns) tend to have more followers and a few friends compared to non-verified accounts <br />
This makes sense, since verified users are usually famous people such as celebrities, politicians, etc. Therefore, they have more followers than 'normal' users. <br />
There is no significant difference between users with geo_enabled 0 or 1.

In [None]:
# One panel per (geo_enabled, verified) combination: rows are geo_enabled, columns are verified.
g = sns.FacetGrid(users, col="verified",  row="geo_enabled", margin_titles=True)
# Scatter of friends vs. followers in each panel; fit_reg=False turns off the regression line.
g = g.map(sns.regplot, 'followers_count', 'friends_count', fit_reg=False)

<h2>Most Active Users</h2>
These are the top 10 users who posted/retweeted the most. <br />
To be an active user, one must have tweeted more than 5 different posts.<br />
None of them are verified accounts.

<h3>All Posts</h3>

In [None]:
# Most active users, counting both new tweets and retweets.
# nunique() collapses the same text posted several times by one user.
uniquetweets = (
    tweets.groupby('snsuserid').text.nunique()
    .reset_index()
    .rename(columns={'text': 'nu_tweets'})
)
# Keep users with more than 5 distinct texts, most active first. Chaining
# returns new frames, so nothing is sorted in place on a filtered slice and
# the index reset is actually kept.
useractivity = (
    uniquetweets.query("nu_tweets>5")
    .sort_values('nu_tweets', ascending=False)
    .reset_index(drop=True)
)
# Attach screen name and verified flag with one join instead of two
# full-table queries per row. The first row per user id is used, as before.
user_labels = (
    users[['snsuserid', 'screen_name', 'verified']]
    .drop_duplicates('snsuserid')
    .rename(columns={'screen_name': 'name'})
)
useractivitytopk_all = useractivity.iloc[:K].merge(user_labels, on='snsuserid', how='left')
# Users absent from `users` (e.g. filtered out earlier) are labelled with
# their id instead of aborting the cell.
useractivitytopk_all['name'] = useractivitytopk_all['name'].fillna(
    useractivitytopk_all['snsuserid'].astype(str))

ax = sns.barplot(data=useractivitytopk_all, y='nu_tweets', x='name')
ax.set_yscale('log')
ax.set_ylabel('# All Posts')
ax.set_xlabel('Top{} Active Users'.format(K))
ax.set_title('Most Active Users')
ax.xaxis.set_tick_params(rotation=90)
plt.grid()

<h3>What are they talking about?</h3>

In [None]:
# Word cloud built from every post written by the top-K most active users.
top_all_ids = useractivitytopk_all.snsuserid.values
tmp = tweets.loc[tweets.snsuserid.isin(top_all_ids)]
text = _get_text(tmp.text.values)
makeImage(getFrequencyDictForText(text), width=400, height=400, shape=(6,6))

<h3>Only New Tweets</h3>

In [None]:
# Most active users, counting new tweets only (retweets excluded).
# nunique() collapses the same text posted several times by one user.
uniquetweets = (
    tweets.query("isaRetweet==0")
    .groupby('snsuserid').text.nunique()
    .reset_index()
    .rename(columns={'text': 'nu_tweets'})
)
# Keep users with more than 5 distinct texts, most active first. Chaining
# returns new frames, so nothing is sorted in place on a filtered slice and
# the index reset is actually kept.
useractivity = (
    uniquetweets.query("nu_tweets>5")
    .sort_values('nu_tweets', ascending=False)
    .reset_index(drop=True)
)
# Attach screen name and verified flag with one join instead of two
# full-table queries per row. The first row per user id is used, as before.
user_labels = (
    users[['snsuserid', 'screen_name', 'verified']]
    .drop_duplicates('snsuserid')
    .rename(columns={'screen_name': 'name'})
)
useractivitytopk_tweets = useractivity.iloc[:K].merge(user_labels, on='snsuserid', how='left')
# Users absent from `users` (e.g. filtered out earlier) are labelled with
# their id instead of aborting the cell.
useractivitytopk_tweets['name'] = useractivitytopk_tweets['name'].fillna(
    useractivitytopk_tweets['snsuserid'].astype(str))

ax = sns.barplot(data=useractivitytopk_tweets, y='nu_tweets', x='name')
ax.set_yscale('log')
# Label fixed: it previously read '# New TweetsAll' (copy-paste leftover).
ax.set_ylabel('# New Tweets')
ax.set_xlabel('Top{} Active Users'.format(K))
ax.set_title('Most Active Users')
ax.xaxis.set_tick_params(rotation=90)
plt.grid()

<h3>What are they talking about?</h3>

In [None]:
# Word cloud built from the posts of the top-K users ranked by new tweets.
# NOTE(review): all posts of these users are included, retweets too --
# confirm whether the section should be restricted to isaRetweet==0.
top_tweet_ids = useractivitytopk_tweets.snsuserid.values
tmp = tweets.loc[tweets.snsuserid.isin(top_tweet_ids)]
text = _get_text(tmp.text.values)
makeImage(getFrequencyDictForText(text), width=400, height=400, shape=(6,6))

<h2>Spammers</h2>
These are those users who posted the same tweet multiple times.

In [None]:
# Accounts created after 2018-04-26 are kept as candidate bots; the shape shows how many there are.
possible_bots = users.query("created_at > '2018-04-26'")
possible_bots.shape

In [None]:
# How many candidate bots share each follower count (log-scaled y axis).
tmp = possible_bots.groupby('followers_count').size().reset_index(name='nusers')
ax = sns.scatterplot(x='followers_count', y='nusers', data=tmp)
ax.set_yscale('log')

In [None]:
# All posts written by the candidate bots, followed by the size of that subset.
bot_ids = possible_bots.snsuserid.values
tweets_possible_bots = tweets.loc[tweets.snsuserid.isin(bot_ids)]
tweets_possible_bots.shape

In [None]:
# Size of the subset of candidate-bot posts that are retweets (isaRetweet==1).
tweets_possible_bots.query("isaRetweet==1").shape

In [None]:
# Print the ten candidate-bot retweets with the highest numRetweets, each
# followed by the matching row(s) of the user table and a blank line.
top_bot_retweets = (
    tweets_possible_bots
    .query("isaRetweet==1")
    .sort_values(by='numRetweets', ascending=False)
    .head(10)
)
for i, r in top_bot_retweets.iterrows():
    author_rows = users.query("snsuserid == @r.snsuserid").values
    print(r.numRetweets, r.numFavourites, r.snsuserid, sep='\n')
    print(author_rows)
    print(r.snspostid, r.text, '', sep='\n')

In [None]:
# Size of the subset of tweets mentioning 'repúblicadominicana' (case-insensitive).
mentions_mask = tweets.text.str.contains('repúblicadominicana', case=False)
tweets.loc[mentions_mask].shape
# To read the matching tweets, iterate over tweets.loc[mentions_mask].text.

In [None]:
# by repeated tweet
# Count how often each (user, text) pair occurs, then add up the counts of
# the pairs that occur more than twice.
tmp = tweets.groupby(['snsuserid','text']).size().reset_index(name='counts')
tmp.loc[tmp.counts > 2, 'counts'].sum()

<h2>Tweets</h2>

In [None]:
# Parse the tweet timestamps. pd.to_datetime replaces the unit-less
# astype("datetime64") cast, which recent pandas versions reject. utc=True
# followed by tz_localize(None) yields timezone-naive UTC values whether or
# not the raw strings carry an offset. Re-running this cell is harmless.
tweets["created_at"] = pd.to_datetime(tweets["created_at"], utc=True).dt.tz_localize(None)

<h2>Tweets Activity</h2>

In [None]:
# Tweet volume over time in 8 bins (labelled "per day" in the title).
ax = tweets.created_at.hist(bins=8)
ax.set(xlabel='Date', ylabel='# Tweets', title="Activity per Day")
ax.xaxis.set_tick_params(rotation=90)
# NOTE(review): plt.grid() without arguments toggles the grid, and pandas'
# hist() switches it on by default -- confirm the resulting state is intended.
plt.grid()

In [None]:
# Tweet volume over time in 192 bins (labelled "per hour" in the title).
ax = tweets.created_at.hist(bins=192)
ax.set(xlabel='Date', ylabel='# Tweets', title="Activity per Hour")
ax.xaxis.set_tick_params(rotation=90)
# NOTE(review): plt.grid() without arguments toggles the grid, and pandas'
# hist() switches it on by default -- confirm the resulting state is intended.
plt.grid()

In [None]:
# Cleaned version of every tweet text (URLs, unwanted characters and stopwords removed).
tweets['text_fmt'] = tweets['text'].apply(_get_tokens)

<h3>All Tweets</h3>
(tweets and retweets)

In [None]:
# Word cloud over the cleaned text of all posts (tweets and retweets).
text = ' '.join(tweets.text_fmt.values)
wc = WordCloud(
    background_color="white",
    max_words=5000,
    width=2000,
    height=1000,
    stopwords=all_stopwords,
)
wc.generate_from_frequencies(getFrequencyDictForText(text))
plt.figure(figsize=(40,20))
plt.imshow(wc, interpolation="bilinear")
# hide both axes' ticks
plt.xticks([])
plt.yticks([])

<h2>Tweet-Only Text</h2>

In [None]:
# Word cloud over the cleaned text of original tweets only (isaRetweet==0).
text = ' '.join(tweets.query("isaRetweet==0").text_fmt.values)
wc = WordCloud(
    background_color="white",
    max_words=5000,
    width=2000,
    height=1000,
    stopwords=all_stopwords,
)
wc.generate_from_frequencies(getFrequencyDictForText(text))
plt.figure(figsize=(40,20))
plt.imshow(wc, interpolation="bilinear")
# hide both axes' ticks
plt.xticks([])
plt.yticks([])

<h2>Retweets-Only Text</h2>

In [None]:
# Word cloud over the cleaned text of retweets only (isaRetweet==1).
text = ' '.join(tweets.query("isaRetweet==1").text_fmt.values)
wc = WordCloud(
    background_color="white",
    max_words=5000,
    width=2000,
    height=1000,
    stopwords=all_stopwords,
)
wc.generate_from_frequencies(getFrequencyDictForText(text))
plt.figure(figsize=(40,20))
plt.imshow(wc, interpolation="bilinear")
# hide both axes' ticks
plt.xticks([])
plt.yticks([])

<h3>Topic Modeling (NMF and LDA)</h3>

In [None]:
# Stemmed version of every cleaned tweet. The stemmer works on single words,
# so each token is stemmed separately and the stems are re-joined; passing
# the whole sentence to stemmer.stem treated it as one long word.
tweets['text_fmt_stem'] = tweets['text'].apply(
    lambda x: ' '.join(stemmer.stem(token) for token in _get_tokens(x).split()))

In [None]:
# Corpus for the vectorisers below: the cleaned (unstemmed) tweet texts.
# NOTE(review): text_fmt_stem is computed above but not used here -- confirm which column is intended.
documents = tweets.text_fmt.values

In [None]:
#http://scikit-learn.org/0.18/auto_examples/applications/topics_extraction_with_nmf_lda.html
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
# Number of documents in the corpus (only used in the progress messages below).
n_samples = documents.shape[0]
# Vocabulary size cap passed to the vectorisers as max_features.
n_features = 1000
# Number of topics fitted by both NMF and LDA.
n_topics = 5
# Number of highest-weighted words printed per topic.
n_top_words = 20
def print_top_words(model, feature_names, n_top_words):
    """Print the ``n_top_words`` highest-weighted features of every topic.

    ``model.components_`` is iterated row by row (one row per topic); for
    each row a header line and the top features, best first, are printed.
    A single blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # indices sorted by descending weight, truncated to the top n
        best = weights.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[i] for i in best))
    print()

In [None]:
# Use tf-idf features for NMF.
# Terms present in more than 95% of the documents or in fewer than 2
# documents are ignored; the vocabulary is capped at n_features terms.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(documents)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

In [None]:
# Use tf (raw term count) features for LDA.
# Same document-frequency cut-offs and vocabulary cap as the tf-idf vectoriser.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
)
t0 = time()
tf = tf_vectorizer.fit_transform(documents)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

In [None]:
# Fit the NMF model
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf_params = dict(n_components=n_topics, random_state=1, l1_ratio=.5)
try:
    # Original call; works wherever the `alpha` keyword is still accepted.
    nmf_estimator = NMF(alpha=.1, **nmf_params)
except TypeError:
    # Newer scikit-learn releases dropped `alpha` in favour of alpha_W/alpha_H.
    # NOTE(review): the regularisation is scaled differently there, so topics
    # may differ slightly from runs on older versions -- confirm acceptable.
    nmf_estimator = NMF(alpha_W=.1, **nmf_params)
nmf = nmf_estimator.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

In [None]:
print("\nTopics in NMF model:")
# get_feature_names() was removed from newer scikit-learn releases; prefer
# its replacement when available and fall back to the old name otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

In [None]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# `n_components` replaces the `n_topics` keyword, which later scikit-learn
# releases no longer accept.
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

In [None]:
print("\nTopics in LDA model:")
# get_feature_names() was removed from newer scikit-learn releases; prefer
# its replacement when available and fall back to the old name otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

<h2>Retweets</h2>

In [None]:
# Peek at the first row of the retweets table.
retweets.head(1)

In [None]:
# Number of retweet rows per (parentPostAuthor, parentPost) pair, largest first.
grouped = (
    retweets
    .groupby(['parentPostAuthor','parentPost'])
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)

In [None]:
# The five (author, post) pairs with the highest retweet counts.
# Note: `most_retweeted` is reassigned in the next cell to rows of `tweets`.
most_retweeted = grouped.iloc[:5,:]
most_retweeted

In [None]:
# Rows of `tweets` whose post id is among the 100 most retweeted parent posts.
# The ids are compared as strings; the builtin `str` replaces the `np.str`
# alias, which newer NumPy releases no longer provide.
most_retweeted = tweets.loc[tweets.snspostid.isin(grouped.iloc[:100,:].parentPost.astype(str).values)]
most_retweeted.head(2)

In [None]:
# Rows of `tweets` whose post id is among the 100 least retweeted parent posts.
# The ids are compared as strings; the builtin `str` replaces the `np.str`
# alias, which newer NumPy releases no longer provide.
least_retweeted = tweets.loc[tweets.snspostid.isin(grouped.iloc[-100:,:].parentPost.astype(str).values)]
least_retweeted.head(2)

In [None]:
# Print the text of the tweets matched through least_retweeted.parentPost.
# `str` replaces the `np.str` alias (no longer provided by newer NumPy), and
# the loop variable no longer shadows the builtin `id`.
# NOTE(review): least_retweeted holds rows of `tweets`; confirm that table has
# a parentPost column -- iterating least_retweeted directly may be what was meant.
for idx, row in tweets.loc[tweets.snspostid.isin(least_retweeted.parentPost.astype(str).values)].iterrows():
    print(row.text)
    print('')

In [None]:
# Build one lowercase string from the most retweeted tweets, without basic
# punctuation and stopwords. Lowercasing happens once, not per stopword.
text = ' '.join(most_retweeted.text.values.tolist()).lower()
# Punctuation goes first so that "que," or "de." are recognised as stopwords.
for mark in ('?', '.', ',', ':', ';'):
    text = text.replace(mark, '')
# Filter whole words: the previous substring replacement of ' word ' by ''
# glued the neighbouring words together ("hola de mundo" -> "holamundo").
text = ' '.join(word for word in text.split() if word not in all_stopwords)

In [None]:
# makeImage requires the canvas size and figure shape; pass the same values
# as the other word-cloud calls (omitting them raised a TypeError).
makeImage(getFrequencyDictForText(text), width=400, height=400, shape=(6,6))

In [None]:
# Install nltk into the interpreter that runs this kernel (sys.executable).
# NOTE(review): nltk is already imported near the top of the notebook, so on
# a fresh environment this has to run before those cells; the version is
# not pinned.
import sys
!{sys.executable} -m pip install nltk