# # Exploratory analysis on timelines tweets

In this dataset, we extract the timelines (last 200 tweets) for all users interacting with the three popular users

- MashiRafael (politics)
- aguschmer (sports)
- KarlaMoralesR (activism on humanitarian aid)

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import os
from tqdm import tqdm

In [None]:
ds = pd.read_csv('../data/convusersec/twconvrsu_csv_v2i_60k/train.csv')
ds.info()

In [None]:
ds.Context.str.len().plot.hist(bins=100)
plt.yscale('log')

In [None]:
ds.Utterance.str.len().plot.hist(bins=100)
plt.yscale('log')

In [24]:
pusers = {}
pwords = {}
timelines = {}

n_tweets = 0
descriptions = {}

with open('../data/convusersec/timelines_raw.csv', 'r') as f:
    reader = csv.DictReader(f)
    next(reader)
    prior_screen_name = ''
    description = ''
    tweets = []
    
    for i, row in tqdm(enumerate(reader), total=profiles.shape[0]):
        
        username = row['screen_name']
        pusers[username] = 1 if username not in pusers else pusers[username] + 1
        
        if prior_screen_name != username:
            timeline = ' '.join(tweets)
            wlist = timeline.split()
            
            for t in wlist:
                pwords[t] = 1
            
            timelines[prior_screen_name] = len(wlist)
            prior_screen_name = username
            tweets =[]

            if len(description) > 0:
                descriptions[prior_screen_name] = len(description.split())
        
        tweet = row['text']          
        description = row['description']
        tweets.append(tweet)
        
        
n_tweets = i + 1

100%|█████████▉| 14860508/14860509 [08:49<00:00, 28053.28it/s]


In [42]:
del timelines['']
tl_words = np.array(list(timelines.values()))
tl_tweets = np.array(list(pusers.values()))

In [46]:
stats =[]
stats.append( ['users', len(pusers)] )
stats.append( ['desc', len(descriptions)] )
stats.append( ['tweets', n_tweets] )
stats.append( ['words', len(pwords)] )

stats.append( ['min tweets', tl_tweets.min()] )
stats.append( ['avg tweets', tl_tweets.mean()] )
stats.append( ['max tweets', tl_tweets.max()] )
stats.append( ['median tweet', np.median(tl_tweets)] )


stats.append( ['min words', tl_words.min()] )
stats.append( ['avg words', tl_words.mean()] )
stats.append( ['max words', tl_words.max()] )
stats.append( ['median words', np.median(tl_words)] )

stats_df = pd.DataFrame(stats, columns=['stat', 'value'], dtype=object)
stats_df

Unnamed: 0,stat,value
0,users,120220.0
1,desc,71142.0
2,tweets,14860508.0
3,words,13607961.0
4,min tweets,1.0
5,avg tweets,123.611
6,max tweets,39436.0
7,median tweet,74.0
8,min words,1.0
9,avg words,1571.77


In [47]:
stats_df.to_csv('../results/ds_stats_timelines.csv', index=False)

## who are those outliers??

In [44]:
for k,v in pusers.items():
    if v > 30000:
        print(k, v)

eluniversocom 31637
trendinaliaEC 39436


# trec profiles

In [7]:
data_dir = '~/data/twconv/trec/dialogs'

In [14]:
path = os.path.join(data_dir,'traindiags.csv')
ds = pd.read_csv(path)
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26769 entries, 0 to 26768
Data columns (total 16 columns):
id                       26769 non-null int64
screen_name              26769 non-null object
text                     26769 non-null object
in_reply_to_status_id    20764 non-null float64
created_at               26769 non-null object
latitude                 65 non-null float64
longitude                65 non-null float64
lang                     26769 non-null object
timestamp                26757 non-null object
conversation_id          26769 non-null int64
conversation_deep        26769 non-null int64
num_replies              26769 non-null int64
num_users                26769 non-null int64
url                      26769 non-null object
dialog_id                26769 non-null int64
turns                    26769 non-null int64
dtypes: float64(3), int64(7), object(6)
memory usage: 3.3+ MB


In [15]:
path = os.path.join(data_dir,'validdiags.csv')
dsval = pd.read_csv(path)
dsval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 16 columns):
id                       2025 non-null int64
screen_name              2025 non-null object
text                     2025 non-null object
in_reply_to_status_id    2025 non-null float64
created_at               2025 non-null object
latitude                 2 non-null float64
longitude                2 non-null float64
lang                     2025 non-null object
timestamp                2024 non-null object
conversation_id          2025 non-null int64
conversation_deep        2025 non-null int64
num_replies              2025 non-null int64
num_users                2025 non-null int64
url                      2025 non-null object
dialog_id                2025 non-null int64
turns                    2025 non-null int64
dtypes: float64(3), int64(7), object(6)
memory usage: 253.2+ KB


In [19]:
trainusers = set(ds.screen_name.unique())
validusers = set(dsval.screen_name.unique())
len(trainusers), len(validusers), len(trainusers.intersection(validusers))

(8034, 1316, 1284)

In [20]:
path = os.path.join(data_dir,'testdiags.csv')
dstest = pd.read_csv(path)
dstest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 16 columns):
id                       2025 non-null int64
screen_name              2025 non-null object
text                     2025 non-null object
in_reply_to_status_id    2025 non-null float64
created_at               2025 non-null object
latitude                 3 non-null float64
longitude                3 non-null float64
lang                     2025 non-null object
timestamp                2025 non-null object
conversation_id          2025 non-null int64
conversation_deep        2025 non-null int64
num_replies              2025 non-null int64
num_users                2025 non-null int64
url                      2025 non-null object
dialog_id                2025 non-null int64
turns                    2025 non-null int64
dtypes: float64(3), int64(7), object(6)
memory usage: 253.2+ KB


In [24]:
testusers = set(dstest.screen_name.unique())
len(trainusers), len(testusers), len(trainusers.intersection(testusers)),len(validusers.intersection(testusers))

(8034, 1299, 1267, 569)