In [146]:
!pip install ipynb
!pip install pandas
!pip install scipy



In [147]:
# from .full.Gensim_Doc_Modelling import *
# import ipynb.fs.defs.Topic_Analysis

from gsdmm_master.gsdmm import MovieGroupProcess
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter


from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from matplotlib import colors as mcolors
import scipy.sparse as sp

In [148]:
def preprocess_dataset(dataset):
    dataset = dataset.iloc[dataset.date_created_at.dropna().index]
    dataset['date_index'] = pd.to_datetime(dataset.date_created_at, errors='coerce')
    dataset = dataset.loc[dataset['date_index'].dropna().index]
    
    datasetusers = []

    for mdate, df in dataset.groupby(pd.Grouper(key='date_index', freq='d')):
        datasetusers.append(df.loc[df.user_id.drop_duplicates().index])

    return pd.concat(datasetusers)

def Map(func, lst):
    return list(map(func, lst))

def Filter(func, lst):
    return list(filter(func, lst))

def Enumerate(lst):
    return list(enumerate(lst))

In [149]:
def get_data_between(dataset, mindate, maxdate):
    lowerbound = dataset[dataset.date_index > mindate]
    return lowerbound[lowerbound.date_index < maxdate]

In [150]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alxau\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [151]:
tokenizer = TweetTokenizer()

sw = stopwords.words('english')

sw = sw + [',', '.', '!', '?', '’', '‘', '(', ')', '”', '“', '@', '/', '-']

def transform_social_text(sentences, lower=True):
    hashtags = re.compile(r"^#\S+|\s#\S+")
    mentions = re.compile(r"^@\S+|\s@\S+")
    urls = re.compile(r"https?://\S+")

    def process_text(text):
        if lower:
            text = text.lower()
        text = hashtags.sub('', text)
        text = mentions.sub('', text)
        text = urls.sub('', text)
        text = text.replace('\n', ' .')

        return text.strip()
    
    return list(map(process_text, sentences))

def tokenize_tweet(tweet):
    word_tokens = tokenizer.tokenize(tweet)
    word_tokens = [w for w in word_tokens if not w in sw]
    return word_tokens

In [152]:
def traing_cluster_algo(data, K, iterations):
    mgp = MovieGroupProcess(K=K, alpha=0.1, beta=0.1, n_iters=iterations)
    mgp.fit(data, 1000)
    return mgp

In [153]:
def get_vocabularies(tokens):
    cc = Counter()
    for lst in tokens:
        cc.update(lst)

    vocab = dict([(i, v) for i, v in list(enumerate(map(lambda x :x[0], cc.most_common())))[:10000]])
    
    ## Add Unk Token
    vocab[len(vocab)] = '$$unk$$'
    
    reverse_vocab = dict([(v , i) for i, v in vocab.items()])
    return vocab, reverse_vocab, '$$unk$$'

# Get Cluster numbers by date
def get_cluster_timeseries(df, mdf):
    timeseries = pd.merge(mdf['cluster'], df[['date_created_at']], how='left', left_index=True, right_index=True)
    return timeseries
    
'''
Returns a tuple of 
    (cluster_size,
    cluster_id, 
    topwords,
    most representative tweet, 
    avg engagement,
    avg followers, 
    score)
'''
def addClusterMetrics(size2cluster, clusters, df, tweets_indexes, tweet_tokens):
    mdf = pd.DataFrame([(mid, np.argmax(clusters.score(doc)), max(clusters.score(doc)), doc) for mid, doc in zip(tweets_indexes, tweet_tokens)])
    mdf = mdf.set_index([0])
    mdf.columns = ['cluster', 'score', 'text']
    
    # Cluster Id x Date
    timeseries = get_cluster_timeseries(df, mdf)
   
    # Get Centroids (tweets)
    result = pd.merge(mdf, df[['text', 'score_engagement', 'counts_followers']], how='left', left_index=True, right_index=True)
    # Aggregate data by cluster (by avg)
    aggregations = result.fillna(0).groupby('cluster').mean()
    idx = result.groupby(['cluster'])['score'].transform(max) == mdf['score']
    rr = result[idx].groupby(['cluster']).first()
    centroids = pd.merge(aggregations, rr, how='left', on='cluster')['text_y']
    
    # Get cluster ids
    cluster_numbers = mdf['cluster'].unique()
    cluster_numbers.sort()
    
    return list(zip(map(lambda x:x[0], size2cluster), 
             cluster_numbers,
             [" ".join([w for w, s in lst]) for lst in map(lambda x:x[1], size2cluster)],
             centroids,
             aggregations['score_engagement'], 
             aggregations['counts_followers'],
             (1 + aggregations['score_engagement']) * np.array(list(map(lambda x:x[0], size2cluster)))  )), timeseries


'''Returns cluster aggregated data and the volume timeseries of clusters'''
def extract_cluster(mgp, dataset, topk, tweets_indexes, tweet_tokens, order_pos=4):
    
    doc_count = mgp.cluster_doc_count
    word_distribution = [
        sorted(list(cluster.items()), key=lambda x: x[1], reverse=True)[:topk]
        for cluster in mgp.cluster_word_distribution
    ]
    
    
    size2cluster = [(d, w) for d, w in zip(doc_count, word_distribution) if d > 0]
    print(mgp.cluster_doc_count)
    size2cluster, timeseries = addClusterMetrics(size2cluster, mgp, dataset, tweets_indexes, tweet_tokens)
    size2cluster = sorted(size2cluster, key=lambda x: x[order_pos], reverse=True)
    
    return size2cluster, timeseries


'''
Compute the cluster algorithm on a dataset

Parameters:
    dataset: kpi6 dataset
    minddate: from date
    maxdate: to date
    K: # cluster upper bound
    iterations: of the clustering algorithm
    topk: most representative words
    order_pos: sort result by position of the tuple in the result
    
Returns:
    cluster_aggregated_data (see #addClusterMetrics)
    timeseries (see #get_cluster_timeseries)
    cluster algorithm object
'''
def compute_clusters_between_dates(dataset, mindate, maxdate,
                                   K, iterations, topk, order_pos = 4):
    
    selecteddata = get_data_between(dataset, mindate, maxdate)
    tweets_indexes = selecteddata.text.dropna().index

    processedtweets = transform_social_text(selecteddata.text.dropna())
    processedtweets = [tokenize_tweet(tweet) for tweet in processedtweets]
    processedtweets = list(filter(lambda x: len(x[1]) > 0, zip(tweets_indexes, processedtweets)))

    tweets_indexes = list(map(lambda x:x[0], processedtweets))
    tweet_tokens = list(map(lambda x:x[1], processedtweets))
    
    vocab, reverse_vocab, unk = get_vocabularies(tweet_tokens)
    
    clusters = traing_cluster_algo(tweet_tokens, K, iterations)
    
    aggregated_data, timeseries = extract_cluster(clusters, dataset, topk, tweets_indexes, tweet_tokens, order_pos)
    
    return aggregated_data, timeseries, clusters, ( vocab, reverse_vocab, unk), tweet_tokens

In [154]:
def init_bottom(bottom, df):
    for i in df.index:
        if i not in bottom:
            bottom[i] = 0

def get_x_y(indexes, vals, bottom):
    keys = sorted(list(bottom.keys()))
    
    vals = [vals[i] if i in indexes else 0 for i in keys]
    return pd.DatetimeIndex(keys), vals

def update_bottom(bottom, bottom_df):
    for ix in bottom_df.index:
        bottom[ix] += bottom_df.loc[ix]['val']
        

def get_bottom_series(bottom):
    keys = sorted(list(bottom.keys()))
    return [bottom[k] for k in keys]



def get_timeseries(series):
    %matplotlib inline
    seriesc = series.copy()
    seriesc['val'] = 1

    seriesc['date_index'] =  pd.to_datetime(seriesc.date_created_at,  errors='coerce')

    ts = seriesc.groupby(['cluster', pd.Grouper(key='date_index', freq='d')]).count()
    multindex = ts.index
    clusters = multindex.get_level_values(0).unique()
    return [ts.loc[i] for i in clusters]


def print_topic_chart(series, clusters):
    clusters_sorted_by_num = sorted(clusters, key=lambda x:x[1])
    topwords = list(map(lambda x:" ".join(x[2].split(' ')[:7]), clusters_sorted_by_num))
    
    # get index
    indexes = np.argsort(list(map(lambda x:x[0], clusters_sorted_by_num)))[::-1]
    
    
    colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)

    # Sort colors by hue, saturation, value and name.
    by_hsv = sorted((tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
                    for name, color in colors.items())
    sorted_names = [name for hsv, name in by_hsv]
    np.random.shuffle(sorted_names)

    ss=get_timeseries(series)
    
    fig = plt.figure(figsize=(10, 8), constrained_layout=True)
    axs = fig.subplots(1,1)
    bottom = {}
    
    # get all indexes
    
    for i in range(len(ss)):
        init_bottom(bottom, ss[i])
        
    for i in range(len(ss)):
        plt.sca(axs)
        plt.xticks(rotation=45)

        axs.set_title('Volumes')
        axs.grid(True)
        
        if i != 0:
            s_indexes = ss[indexes[i]].index
            
            bottom_df = ss[indexes[i-1]]
            update_bottom(bottom, bottom_df)
            bottom_series = get_bottom_series(bottom)
            X, y = get_x_y(ss[indexes[i]].index, ss[indexes[i]]['val'], bottom)
            
            axs.bar(X, y, bottom=bottom_series, color=sorted_names[indexes[i]])
        else:
            X, y = get_x_y(ss[indexes[i]].index, ss[indexes[i]]['val'], bottom)
            axs.bar(X, y, color=sorted_names[indexes[i]])

    axs.legend(np.array(sorted_names)[indexes], labels=np.array(topwords)[indexes], loc ="upper left")


In [155]:
nutella = pd.read_csv('data/nutella.csv') 
nutellanospam = preprocess_dataset(nutella)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['date_index'] = pd.to_datetime(dataset.date_created_at, errors='coerce')


In [156]:
print(nutellanospam)

         social                   id                              topic_id  \
393301  twitter  1178808882996666368  d0f0c411-fca1-4412-8508-1348438d1db9   
393300  twitter  1178810988868591616  d0f0c411-fca1-4412-8508-1348438d1db9   
393299  twitter  1178815313313488898  d0f0c411-fca1-4412-8508-1348438d1db9   
393298  twitter  1178816391421542400  d0f0c411-fca1-4412-8508-1348438d1db9   
393297  twitter  1178820636917059589  d0f0c411-fca1-4412-8508-1348438d1db9   
...         ...                  ...                                   ...   
4       twitter  1362453782257422336  d0f0c411-fca1-4412-8508-1348438d1db9   
3       twitter  1363047761239769088  d0f0c411-fca1-4412-8508-1348438d1db9   
2       twitter  1363063651238731782  d0f0c411-fca1-4412-8508-1348438d1db9   
1       twitter  1363074677300690944  d0f0c411-fca1-4412-8508-1348438d1db9   
0       twitter  1363096473848659972  d0f0c411-fca1-4412-8508-1348438d1db9   

       rule_id is_comment                                      

In [157]:
# Drops inconsistent columns
nutella_no_na = nutellanospam.dropna(1)

In [158]:
nutella_no_na

Unnamed: 0,social,id,topic_id,rule_id,is_comment,text,lang_value,date_created_at,counts_following,counts_followers,...,counts_likes,score_engagement,user_id,user_username,user_name,user_profile_picture,user_gender_value,user_lang_source,user_verified,date_index
393301,twitter,1178808882996666368,d0f0c411-fca1-4412-8508-1348438d1db9,9161.0,False,@thekryptikrose Thanks. I just squeezed my #Nu...,en,2019-09-30T23:08:33.000Z,12684.0,14863.0,...,2.0,0.013366,1172501761430556672.0,misplacedcomma2,misplaced comma,https://pbs.twimg.com/profile_images/117439088...,unknown,provided,0.0,2019-09-30 23:08:33+00:00
393300,twitter,1178810988868591616,d0f0c411-fca1-4412-8508-1348438d1db9,9161.0,False,I don’t give a fuck #Nutella little bitch,en,2019-09-30T23:16:55.000Z,3.0,131.0,...,0.0,0.0,1053744668.0,Felony514,Flesh-N-Felony,https://pbs.twimg.com/profile_images/125563236...,unknown,provided,0.0,2019-09-30 23:16:55+00:00
393299,twitter,1178815313313488898,d0f0c411-fca1-4412-8508-1348438d1db9,9161.0,False,Nutella with the mood lighting #crestedgecko #...,en,2019-09-30T23:34:06.000Z,214.0,127.0,...,1.0,0.969163,38630434.0,EmmyStormborn,EmeraldStormborn,https://pbs.twimg.com/profile_images/115176010...,unknown,provided,0.0,2019-09-30 23:34:06+00:00
393298,twitter,1178816391421542400,d0f0c411-fca1-4412-8508-1348438d1db9,9161.0,False,Today only! #Crepes #delicious #savory #yummy ...,en,2019-09-30T23:38:23.000Z,71.0,20.0,...,0.0,0.0,1285092918.0,tcbar_ontario,The Chocolate Bar Ontario,https://pbs.twimg.com/profile_images/116511029...,unknown,detected,0.0,2019-09-30 23:38:23+00:00
393297,twitter,1178820636917059589,d0f0c411-fca1-4412-8508-1348438d1db9,9161.0,False,Hazelnut Time!! Now serving the Hazelnut Nutel...,en,2019-09-30T23:55:15.000Z,13.0,36.0,...,2.0,1.470588,203400288.0,Cheesecakewastd,Sweet Obsession Cheesecakes,https://pbs.twimg.com/profile_images/100961249...,female,provided,0.0,2019-09-30 23:55:15+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,twitter,1362453782257422336,d0f0c411-fca1-4412-8508-1348438d1db9,9161,false,What is your unpopular opinion? Do you think B...,en,2021-02-18T17:27:49.000Z,739.0,208.0,...,0.0,0.0,2973293529.0,hilebryson,Bryson Hile,https://pbs.twimg.com/profile_images/136022794...,male,provided,0.0,2021-02-18 17:27:49+00:00
3,twitter,1363047761239769088,d0f0c411-fca1-4412-8508-1348438d1db9,9161,false,Making up for Pancake Tuesday on Saturday with...,en,2021-02-20T08:48:05.000Z,46.0,30.0,...,0.0,0.0,596341763.0,SimonPastorello,Simon Pastorello,https://pbs.twimg.com/profile_images/114632529...,male,provided,0.0,2021-02-20 08:48:05+00:00
2,twitter,1363063651238731782,d0f0c411-fca1-4412-8508-1348438d1db9,9161,false,Nutella is actually chocolate ki chutney😋 #fac...,en,2021-02-20T09:51:13.000Z,5.0,0.0,...,0.0,0.0,1355421219852484608.0,deivasri_v,Deivasri V,https://pbs.twimg.com/profile_images/136092300...,unknown,provided,0.0,2021-02-20 09:51:13+00:00
1,twitter,1363074677300690944,d0f0c411-fca1-4412-8508-1348438d1db9,9161,false,@VanessaFiji @orangulandtrust @griffjane @oran...,en,2021-02-20T10:35:02.000Z,4950.0,4175.0,...,0.0,0.0,424144191.0,1BJDJ,Ben 🙂,https://pbs.twimg.com/profile_images/127613935...,male,provided,0.0,2021-02-20 10:35:02+00:00


In [159]:
with open(file='data/authority_sets.txt', mode='r') as f:
    category_list = []
    category = f.readline()[:-1]
    category_list.append(category)
    
    authority_sets = {}
    authority_sets[category] = []
    
    lines = f.readlines()
    for line in lines:
        if(line[0] != '@'):
            if(line != '\n'):
                category = line[:-1]
                category_list.append(category)
                authority_sets[category] = []
        else:
            if(line[-1:] == '\n'):
                authority_sets[category].append(line[1:-1])
            else:
                authority_sets[category].append(line[1:])

for i in authority_sets:
    print(i, authority_sets[i])

categories = Enumerate(category_list)
print(categories)

Politici ['matteosalvinimi', 'GiorgiaMeloni', 'EnricoLetta ', 'nzingaretti ', 'MonicaCirinna ', 'elenabonetti', 'matteorenzi', 'RossellaMuroni', 'luigidimaio', 'ale_dibattista', 'beppe_grillo', 'GiuseppeConteIT', 'NFratoianni', 'pbersani', 'pdnetwork', 'bobogiac', 'gasparripdl ', 'FratellidItalia', 'LegaSalvini', 'Capezzone', 'borghi_claudio', 'berlusconi', 'forza_italia', 'DarioNardella', 'carlaruocco1', 'gualtierieurope', 'BeaLorenzin', 'robersperanza', 'dariofrance', 'DaniloToninelli', 'BeppeSala', 'ellyesse', 'ElioLannutti', 'sbonaccini', 'marcocappato', 'PietroGrasso', 'ItaliaViva', 'zaiapresidente', 'TeresaBellanova', 'Azione_it', 'GuidoCrosetto', 'gparagone']
Esperti di settore e giornalisti ['DarioBressanini', 'sabri_giannini', 'RudyBandiera', 'la_kuzzo', 'M_gabanelli', 'robertosaviano', 'petergomezblog', 'corradoformigli', 'IaconaRiccardo', 'marcotravaglio', 'AndreaScanzi', 'lucatelese', 'stanzaselvaggia', 'concitadeg', 'giucruciani', 'mariogiordano5', 'Tommasolabate', 'DAVIDP