In [33]:
!pip install -r requirements.txt



In [34]:
# from .full.Gensim_Doc_Modelling import *
# import ipynb.fs.defs.Topic_Analysis

from gsdmm_master.gsdmm import MovieGroupProcess
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter


from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from matplotlib import colors as mcolors
import scipy.sparse as sp

In [35]:
def preprocess_dataset(dataset):
    dataset = dataset.iloc[dataset.date_created_at.dropna().index]
    dataset['date_index'] = pd.to_datetime(dataset.date_created_at, errors='coerce')
    dataset = dataset.loc[dataset['date_index'].dropna().index]
    
    datasetusers = []

    for mdate, df in dataset.groupby(pd.Grouper(key='date_index', freq='d')):
        datasetusers.append(df.loc[df.user_id.drop_duplicates().index])

    return pd.concat(datasetusers)

def Map(func, lst):
    return list(map(func, lst))

def Filter(func, lst):
    return list(filter(func, lst))

def Enumerate(lst):
    return list(enumerate(lst))

In [36]:
def get_data_between(dataset, mindate, maxdate):
    lowerbound = dataset[dataset.date_index > mindate]
    return lowerbound[lowerbound.date_index < maxdate]

In [37]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alxau\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
tokenizer = TweetTokenizer()

sw = stopwords.words('english')

sw = sw + [',', '.', '!', '?', '’', '‘', '(', ')', '”', '“', '@', '/', '-']

def transform_social_text(sentences, lower=True):
    hashtags = re.compile(r"^#\S+|\s#\S+")
    mentions = re.compile(r"^@\S+|\s@\S+")
    urls = re.compile(r"https?://\S+")

    def process_text(text):
        if lower:
            text = text.lower()
        text = hashtags.sub('', text)
        text = mentions.sub('', text)
        text = urls.sub('', text)
        text = text.replace('\n', ' .')

        return text.strip()
    
    return list(map(process_text, sentences))

def tokenize_tweet(tweet):
    word_tokens = tokenizer.tokenize(tweet)
    word_tokens = [w for w in word_tokens if not w in sw]
    return word_tokens

In [39]:
def traing_cluster_algo(data, K, iterations):
    mgp = MovieGroupProcess(K=K, alpha=0.1, beta=0.1, n_iters=iterations)
    mgp.fit(data, 1000)
    return mgp

In [40]:
def get_vocabularies(tokens):
    cc = Counter()
    for lst in tokens:
        cc.update(lst)

    vocab = dict([(i, v) for i, v in list(enumerate(map(lambda x :x[0], cc.most_common())))[:10000]])
    
    ## Add Unk Token
    vocab[len(vocab)] = '$$unk$$'
    
    reverse_vocab = dict([(v , i) for i, v in vocab.items()])
    return vocab, reverse_vocab, '$$unk$$'

# Get Cluster numbers by date
def get_cluster_timeseries(df, mdf):
    timeseries = pd.merge(mdf['cluster'], df[['date_created_at']], how='left', left_index=True, right_index=True)
    return timeseries
    
'''
Returns a tuple of 
    (cluster_size,
    cluster_id, 
    topwords,
    most representative tweet, 
    avg engagement,
    avg followers, 
    score)
'''
def addClusterMetrics(size2cluster, clusters, df, tweets_indexes, tweet_tokens):
    mdf = pd.DataFrame([(mid, np.argmax(clusters.score(doc)), max(clusters.score(doc)), doc) for mid, doc in zip(tweets_indexes, tweet_tokens)])
    mdf = mdf.set_index([0])
    mdf.columns = ['cluster', 'score', 'text']
    
    # Cluster Id x Date
    timeseries = get_cluster_timeseries(df, mdf)
   
    # Get Centroids (tweets)
    result = pd.merge(mdf, df[['text', 'score_engagement', 'counts_followers']], how='left', left_index=True, right_index=True)
    # Aggregate data by cluster (by avg)
    aggregations = result.fillna(0).groupby('cluster').mean()
    idx = result.groupby(['cluster'])['score'].transform(max) == mdf['score']
    rr = result[idx].groupby(['cluster']).first()
    centroids = pd.merge(aggregations, rr, how='left', on='cluster')['text_y']
    
    # Get cluster ids
    cluster_numbers = mdf['cluster'].unique()
    cluster_numbers.sort()
    
    return list(zip(map(lambda x:x[0], size2cluster), 
             cluster_numbers,
             [" ".join([w for w, s in lst]) for lst in map(lambda x:x[1], size2cluster)],
             centroids,
             aggregations['score_engagement'], 
             aggregations['counts_followers'],
             (1 + aggregations['score_engagement']) * np.array(list(map(lambda x:x[0], size2cluster)))  )), timeseries


'''Returns cluster aggregated data and the volume timeseries of clusters'''
def extract_cluster(mgp, dataset, topk, tweets_indexes, tweet_tokens, order_pos=4):
    
    doc_count = mgp.cluster_doc_count
    word_distribution = [
        sorted(list(cluster.items()), key=lambda x: x[1], reverse=True)[:topk]
        for cluster in mgp.cluster_word_distribution
    ]
    
    
    size2cluster = [(d, w) for d, w in zip(doc_count, word_distribution) if d > 0]
    print(mgp.cluster_doc_count)
    size2cluster, timeseries = addClusterMetrics(size2cluster, mgp, dataset, tweets_indexes, tweet_tokens)
    size2cluster = sorted(size2cluster, key=lambda x: x[order_pos], reverse=True)
    
    return size2cluster, timeseries


'''
Compute the cluster algorithm on a dataset

Parameters:
    dataset: kpi6 dataset
    minddate: from date
    maxdate: to date
    K: # cluster upper bound
    iterations: of the clustering algorithm
    topk: most representative words
    order_pos: sort result by position of the tuple in the result
    
Returns:
    cluster_aggregated_data (see #addClusterMetrics)
    timeseries (see #get_cluster_timeseries)
    cluster algorithm object
'''
def compute_clusters_between_dates(dataset, mindate, maxdate,
                                   K, iterations, topk, order_pos = 4):
    
    selecteddata = get_data_between(dataset, mindate, maxdate)
    tweets_indexes = selecteddata.text.dropna().index

    processedtweets = transform_social_text(selecteddata.text.dropna())
    processedtweets = [tokenize_tweet(tweet) for tweet in processedtweets]
    processedtweets = list(filter(lambda x: len(x[1]) > 0, zip(tweets_indexes, processedtweets)))

    tweets_indexes = list(map(lambda x:x[0], processedtweets))
    tweet_tokens = list(map(lambda x:x[1], processedtweets))
    
    vocab, reverse_vocab, unk = get_vocabularies(tweet_tokens)
    
    clusters = traing_cluster_algo(tweet_tokens, K, iterations)
    
    aggregated_data, timeseries = extract_cluster(clusters, dataset, topk, tweets_indexes, tweet_tokens, order_pos)
    
    return aggregated_data, timeseries, clusters, ( vocab, reverse_vocab, unk), tweet_tokens

In [41]:
def init_bottom(bottom, df):
    for i in df.index:
        if i not in bottom:
            bottom[i] = 0

def get_x_y(indexes, vals, bottom):
    keys = sorted(list(bottom.keys()))
    
    vals = [vals[i] if i in indexes else 0 for i in keys]
    return pd.DatetimeIndex(keys), vals

def update_bottom(bottom, bottom_df):
    for ix in bottom_df.index:
        bottom[ix] += bottom_df.loc[ix]['val']
        

def get_bottom_series(bottom):
    keys = sorted(list(bottom.keys()))
    return [bottom[k] for k in keys]



def get_timeseries(series):
    %matplotlib inline
    seriesc = series.copy()
    seriesc['val'] = 1

    seriesc['date_index'] =  pd.to_datetime(seriesc.date_created_at,  errors='coerce')

    ts = seriesc.groupby(['cluster', pd.Grouper(key='date_index', freq='d')]).count()
    multindex = ts.index
    clusters = multindex.get_level_values(0).unique()
    return [ts.loc[i] for i in clusters]


def print_topic_chart(series, clusters):
    clusters_sorted_by_num = sorted(clusters, key=lambda x:x[1])
    topwords = list(map(lambda x:" ".join(x[2].split(' ')[:7]), clusters_sorted_by_num))
    
    # get index
    indexes = np.argsort(list(map(lambda x:x[0], clusters_sorted_by_num)))[::-1]
    
    
    colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)

    # Sort colors by hue, saturation, value and name.
    by_hsv = sorted((tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
                    for name, color in colors.items())
    sorted_names = [name for hsv, name in by_hsv]
    np.random.shuffle(sorted_names)

    ss=get_timeseries(series)
    
    fig = plt.figure(figsize=(10, 8), constrained_layout=True)
    axs = fig.subplots(1,1)
    bottom = {}
    
    # get all indexes
    
    for i in range(len(ss)):
        init_bottom(bottom, ss[i])
        
    for i in range(len(ss)):
        plt.sca(axs)
        plt.xticks(rotation=45)

        axs.set_title('Volumes')
        axs.grid(True)
        
        if i != 0:
            s_indexes = ss[indexes[i]].index
            
            bottom_df = ss[indexes[i-1]]
            update_bottom(bottom, bottom_df)
            bottom_series = get_bottom_series(bottom)
            X, y = get_x_y(ss[indexes[i]].index, ss[indexes[i]]['val'], bottom)
            
            axs.bar(X, y, bottom=bottom_series, color=sorted_names[indexes[i]])
        else:
            X, y = get_x_y(ss[indexes[i]].index, ss[indexes[i]]['val'], bottom)
            axs.bar(X, y, color=sorted_names[indexes[i]])

    axs.legend(np.array(sorted_names)[indexes], labels=np.array(topwords)[indexes], loc ="upper left")


In [42]:
with open(file='data/authority_sets.txt', mode='r') as f:
    category_list = []
    category = f.readline()[:-1]
    category_list.append(category)
    
    authority_sets = {}
    authority_sets[category] = []
    
    lines = f.readlines()
    for line in lines:
        if(line[0] != '@'):
            if(line != '\n'):
                category = line[:-1]
                category_list.append(category)
                authority_sets[category] = []
        else:
            if(line[-1:] == '\n'):
                authority_sets[category].append(line[1:-1])
            else:
                authority_sets[category].append(line[1:])

for i in authority_sets:
    print(i, authority_sets[i])

categories = Enumerate(category_list)
print(categories)

Politici ['matteosalvinimi', 'GiorgiaMeloni', 'EnricoLetta ', 'nzingaretti ', 'MonicaCirinna ', 'elenabonetti', 'matteorenzi', 'RossellaMuroni', 'luigidimaio', 'ale_dibattista', 'beppe_grillo', 'GiuseppeConteIT', 'NFratoianni', 'pbersani', 'pdnetwork', 'bobogiac', 'gasparripdl ', 'FratellidItalia', 'LegaSalvini', 'Capezzone', 'borghi_claudio', 'berlusconi', 'forza_italia', 'DarioNardella', 'carlaruocco1', 'gualtierieurope', 'BeaLorenzin', 'robersperanza', 'dariofrance', 'DaniloToninelli', 'BeppeSala', 'ellyesse', 'ElioLannutti', 'sbonaccini', 'marcocappato', 'PietroGrasso', 'ItaliaViva', 'zaiapresidente', 'TeresaBellanova', 'Azione_it', 'GuidoCrosetto', 'gparagone']
Esperti di settore e giornalisti ['DarioBressanini', 'sabri_giannini', 'RudyBandiera', 'la_kuzzo', 'M_gabanelli', 'robertosaviano', 'petergomezblog', 'corradoformigli', 'IaconaRiccardo', 'marcotravaglio', 'AndreaScanzi', 'lucatelese', 'stanzaselvaggia', 'concitadeg', 'giucruciani', 'mariogiordano5', 'Tommasolabate', 'DAVIDP

In [43]:
# filters tweets by a subset of authorities
def get_authority_tweets(auth_list, df):
   return df[df['user_username'].isin(auth_list)]

# filters tweets by multiple subsets of authorities and outputs their union
def get_authority_tweets_multiple_categories(auth_cat_list, df):
   dflist = []
   for i, value in enumerate(auth_cat_list):
      authority_set = authority_sets[value]
      dflist.append(df[df['user_username'].isin(authority_set)])
   return pd.concat(dflist).drop_duplicates()

In [44]:
ferrero_corp = pd.read_csv('data/ferrero_corporate.csv') 
ferrero_corpnospam = preprocess_dataset(ferrero_corp)
ferrero_corp_no_na = ferrero_corpnospam.copy()
# ferrero_corp_no_na = ferrero_corpnospam.dropna(1)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['date_index'] = pd.to_datetime(dataset.date_created_at, errors='coerce')


In [45]:
ferrero_corp_auth = get_authority_tweets_multiple_categories(category_list, ferrero_corp_no_na)
ferrero_corp_auth

Unnamed: 0,social,id,topic_id,rule_id,is_comment,text,lang_value,date_created_at,counts_following,counts_followers,...,counts_likes,score_engagement,user_id,user_username,user_name,user_profile_picture,user_gender_value,user_lang_source,user_verified,date_index
34648,twitter,1174618145786650624,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,La Ferrero riconosce a 6000 dipendenti un prem...,it,2019-09-19T09:36:03.000Z,919.0,1115932.0,...,4649.0,0.519143,1.305370e+08,GiorgiaMeloni,Giorgia Meloni ?? ن,https://pbs.twimg.com/profile_images/113404761...,female,provided,1.0,2019-09-19 09:36:03+00:00
34065,twitter,1176441842096910336,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,Il Ministro Fioramonti si permette di giustifi...,it,2019-09-24T10:22:47.000Z,2192.0,156189.0,...,3477.0,3.001747,4.139162e+08,GuidoCrosetto,Guido Crosetto,https://pbs.twimg.com/profile_images/136781770...,male,provided,0.0,2019-09-24 10:22:47+00:00
33847,twitter,1176748033066844160,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,@theoddmother @lofioramonti In Italia la Ferre...,it,2019-09-25T06:39:28.000Z,2192.0,156189.0,...,3.0,0.001920,4.139162e+08,GuidoCrosetto,Guido Crosetto,https://pbs.twimg.com/profile_images/136781770...,male,provided,0.0,2019-09-25 06:39:28+00:00
32534,twitter,1183821988437135362,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,@gianni_la_marca @angelofilippi Per il mercato...,it,2019-10-14T19:08:51.000Z,2192.0,156189.0,...,0.0,0.000000,4.139162e+08,GuidoCrosetto,Guido Crosetto,https://pbs.twimg.com/profile_images/136781770...,male,provided,0.0,2019-10-14 19:08:51+00:00
32064,twitter,1186949229438263297,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,Siamo in un pericoloso paradosso con il Paese ...,it,2019-10-23T10:15:23.000Z,507.0,181802.0,...,35.0,0.021880,1.351476e+07,LegaSalvini,Lega - Salvini Premier,https://pbs.twimg.com/profile_images/943884100...,unknown,provided,1.0,2019-10-23 10:15:23+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,twitter,1418810115445166080,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,"Ferrero, la strategia green del big di Alba (è...",it,2021-07-24T05:47:46.000Z,262.0,2395017.0,...,7.0,0.000493,3.952189e+08,Corriere,Corriere della Sera,https://pbs.twimg.com/profile_images/134684048...,unknown,provided,1.0,2021-07-24 05:47:46+00:00
841,twitter,1420717382830415880,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,Ferrero l'azienda meglio percepita sul territo...,it,2021-07-29T12:06:34.000Z,15.0,110805.0,...,7.0,0.007394,2.996339e+07,MilanoFinanza,MilanoFinanza,https://pbs.twimg.com/profile_images/132867798...,unknown,provided,0.0,2021-07-29 12:06:34+00:00
822,twitter,1421105542945202182,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,"L’attimo fuggente, online il numero di agosto ...",it,2021-07-30T13:48:58.000Z,424.0,3360.0,...,0.0,0.000000,5.459782e+08,lamescolanza,L'attimo fuggente - Lamescolanza,https://pbs.twimg.com/profile_images/204189121...,unknown,provided,0.0,2021-07-30 13:48:58+00:00
467,twitter,1426075939201421313,7ee62d5c-5811-45ce-9cef-c09d143f041a,12559.0,False,"Mediobanca, Gavio arrotonda la quota. I Ferrer...",it,2021-08-13T06:59:33.000Z,520.0,68044.0,...,0.0,0.001761,3.552595e+07,Affaritaliani,Affaritaliani.it,https://pbs.twimg.com/profile_images/135000774...,unknown,provided,1.0,2021-08-13 06:59:33+00:00


In [46]:
ferrero_corp_auth.to_csv('data/authority_filtered/ferrero_corp_auth.csv')

In [47]:
# istante in cui gli utenti hanno tweettato, e variazione di volume nel periodo immediatamente adiacente
# 

In [48]:
ferrero_corp_spikes = pd.read_csv('data/ferrero_corporate_spikes.csv')

In [49]:
ferrero_corp_authority_spikes

NameError: name 'ferrero_corp_authority_spikes' is not defined