In [58]:
import pandas as pd 
import glob
import os
import numpy as np
import datetime
import re

In [2]:
PATH_DATA = "../../data"

## Loading Phase

In [3]:
filenames = glob.glob(os.path.join(PATH_DATA, "*.csv"))
print(filenames)
df = pd.concat((pd.read_csv(f) for f in filenames))
print(df.shape)
df.head()

['../../data/IRAhandle_tweets_4.csv', '../../data/IRAhandle_tweets_2.csv', '../../data/IRAhandle_tweets_6.csv', '../../data/IRAhandle_tweets_1.csv', '../../data/IRAhandle_tweets_3.csv', '../../data/IRAhandle_tweets_5.csv', '../../data/IRAhandle_tweets_7.csv', '../../data/IRAhandle_tweets_9.csv', '../../data/IRAhandle_tweets_8.csv']
(2973371, 15)


Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,1674084000.0,GAB1ALDANA,People are too toxic. I think I have people po...,United States,English,7/30/2016 20:15,7/30/2016 20:15,3395,2014,2150,RETWEET,Hashtager,0,1,HashtagGamer
1,1674084000.0,GAB1ALDANA,#NowPlaying Don't Shoot (I'm a Man) by @DEVO -...,United States,English,7/30/2016 20:15,7/30/2016 20:15,3395,2014,2146,RETWEET,Hashtager,0,1,HashtagGamer
2,1674084000.0,GAB1ALDANA,the 'I'm the most boring person in the world' ...,United States,English,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2159,RETWEET,Hashtager,0,1,HashtagGamer
3,1674084000.0,GAB1ALDANA,#MyAchillesHeel slippery floors https://t.co/R...,United States,Norwegian,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2160,RETWEET,Hashtager,0,1,HashtagGamer
4,1674084000.0,GAB1ALDANA,#MyAchillesHeel Boring narcissists.....nothing...,United States,English,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2158,RETWEET,Hashtager,0,1,HashtagGamer


## Description of the dataset

In [4]:
float(df.loc[df['region'] == 'United States','region'].count())/float(df['region'].count())

0.6934938715370541

In [5]:
df.dtypes

external_author_id    float64
author                 object
content                object
region                 object
language               object
publish_date           object
harvested_date         object
following               int64
followers               int64
updates                 int64
post_type              object
account_type           object
new_june_2018           int64
retweet                 int64
account_category       object
dtype: object

In [6]:
df.describe(include="all")

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
count,2973367.0,2973371,2973370,2964528,2973371,2973371,2973371,2973371.0,2973371.0,2973371.0,1310946,2973008,2973371.0,2973371.0,2973371
unique,,2848,2365942,36,56,896684,906316,,,,2,20,,,8
top,,EXQUOTE,В городе Сочи. Олимпиада – праздник или стихий...,United States,English,8/16/2017 1:29,3/22/2016 17:35,,,,RETWEET,Russian,,,NonEnglish
freq,,59652,670,2055882,2128963,202,1333,,,,1270702,721191,,,837725
mean,1.296128e+17,,,,,,,3433.524,7018.913,10497.56,,,0.2078735,0.4408955,
std,3.036341e+17,,,,,,,5609.881,14584.63,17687.29,,,0.4057859,0.4964945,
min,34976400.0,,,,,,,-1.0,-1.0,-1.0,,,0.0,0.0,
25%,1930748000.0,,,,,,,327.0,320.0,1787.0,,,0.0,0.0,
50%,2581835000.0,,,,,,,1499.0,1274.0,4333.0,,,0.0,0.0,
75%,3254274000.0,,,,,,,4730.0,10600.0,12341.0,,,0.0,1.0,


In [7]:
print(df['post_type'].unique())
print((df['post_type']=='RETWEET').sum())
print((df['post_type']=='QUOTE_TWEET').sum())
print(df['post_type'].isnull().sum())

['RETWEET' nan 'QUOTE_TWEET']
1270702
40244
1662425


## Cleaning phase

In order to use the dataset, we first have to clean it. For this purpose, we will study, and select only certain row according to the following criteriae :
* First, we suppose that only the english tweet will be used. Indeed, as the disinformation is supposed to be done in USA, it is relevant to keep only the tweet that have been posted in english.
* Second, Donald Trump has been elected officially the 19th of december 2016. As the primary to these elections have begun on february the same year, it can be relevant to keep an interval of one year before the elections date.
* Thirdly, some tweets are posted in one sentence, delimited by capital letters. In this case, the one word sentences have to be separated to help the research in a easier manner. However, alle the tweets can be posted with capital letters, so, for easier computations, it is more relevant to cast them in only lowercase sentences.

In [21]:
# Keep only english tweets
english_df = df[df['language'].astype(str).str.contains('English')]
english_df.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,1674084000.0,GAB1ALDANA,People are too toxic. I think I have people po...,United States,English,7/30/2016 20:15,7/30/2016 20:15,3395,2014,2150,RETWEET,Hashtager,0,1,HashtagGamer
1,1674084000.0,GAB1ALDANA,#NowPlaying Don't Shoot (I'm a Man) by @DEVO -...,United States,English,7/30/2016 20:15,7/30/2016 20:15,3395,2014,2146,RETWEET,Hashtager,0,1,HashtagGamer
2,1674084000.0,GAB1ALDANA,the 'I'm the most boring person in the world' ...,United States,English,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2159,RETWEET,Hashtager,0,1,HashtagGamer
4,1674084000.0,GAB1ALDANA,#MyAchillesHeel Boring narcissists.....nothing...,United States,English,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2158,RETWEET,Hashtager,0,1,HashtagGamer
5,1674084000.0,GAB1ALDANA,Your opinion on Hillary really matters to a no...,United States,English,7/30/2016 20:16,7/30/2016 20:16,3395,2014,2154,RETWEET,Hashtager,0,1,HashtagGamer


In [57]:
# Keep only interesting dates
end_date = 20161108

english_df['publish_date'] = pd.to_datetime(english_df['publish_date'], format='%m/%d/%Y %H:%M')
english_df = english_df[english_df['publish_date'].dt.year*10000+english_df['publish_date'].dt.month*100+english_df['publish_date'].dt.day >= end_date-10000]
english_df = english_df[english_df['publish_date'].dt.year*10000+english_df['publish_date'].dt.month*100+english_df['publish_date'].dt.day >= end_date]

english_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
1179,7.49891e+17,GABRIELLAFALTZ,#democracy2017: The President. https://t.co/0G...,Unknown,English,2017-07-10 12:36:00,7/10/2017 12:36,1125,269,208,RETWEET,German,1,1,NonEnglish
1185,7.49891e+17,GABRIELLAFALTZ,Took a nap and woke up in the French countrysi...,Unknown,English,2017-07-10 18:37:00,7/10/2017 18:37,1125,269,213,RETWEET,German,1,1,NonEnglish
1188,7.49891e+17,GABRIELLAFALTZ,Jared Kushner pushed a hard line against Qatar...,Unknown,English,2017-07-10 21:39:00,7/10/2017 21:39,1126,270,218,RETWEET,German,1,1,NonEnglish
1190,7.49891e+17,GABRIELLAFALTZ,#Camden fire is on the doorstep of @AP London ...,Unknown,English,2017-07-10 03:32:00,7/10/2017 3:32,1126,269,203,RETWEET,German,1,1,NonEnglish
1191,7.49891e+17,GABRIELLAFALTZ,A little pressure from France Inc can do wonde...,Unknown,English,2017-07-10 09:35:00,7/10/2017 9:37,1125,269,205,RETWEET,German,1,1,NonEnglish


In [74]:
# Separate one word sentences and cast it to lowercase characters
#s = 'TheLongAndWindingRoad'
#sent = (' '.join(re.findall('[A-Z][a-z]*', s))).lower()
english_df['content'] = english_df.apply(lambda x : (' '.join(re.findall('[A-Z][a-z]*', x['content']))).lower(), axis=1)
english_df.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
1179,7.49891e+17,GABRIELLAFALTZ,the president g p j w y k m x,Unknown,English,2017-07-10 12:36:00,7/10/2017 12:36,1125,269,208,RETWEET,German,1,1,NonEnglish
1185,7.49891e+17,GABRIELLAFALTZ,took french k fs qiwbjd r,Unknown,English,2017-07-10 18:37:00,7/10/2017 18:37,1125,269,213,RETWEET,German,1,1,NonEnglish
1188,7.49891e+17,GABRIELLAFALTZ,jared kushner qatar qatari e w av ey,Unknown,English,2017-07-10 21:39:00,7/10/2017 21:39,1126,270,218,RETWEET,German,1,1,NonEnglish
1190,7.49891e+17,GABRIELLAFALTZ,camden a p london looks hawley arms hope,Unknown,English,2017-07-10 03:32:00,7/10/2017 3:32,1126,269,203,RETWEET,German,1,1,NonEnglish
1191,7.49891e+17,GABRIELLAFALTZ,a france inc macron b xam gs z reuters,Unknown,English,2017-07-10 09:35:00,7/10/2017 9:37,1125,269,205,RETWEET,German,1,1,NonEnglish


## Analysis part

### Scandale research

In this part, we will go through the english_df database which comports all the tweets we might need. Now, in order to proove that point, we first have to find if some scandale appeared on twitter, we can simply research it in the dataframe.
During the US elections, some scandale about pizzagate, Hillary's missing emails, Trump's bankruptcy, and others have been used to influence the elections. Although Facebook has a more leading roles during these elections, twitter is still a social media, and many russian accounts have been used to tweet about the elections.
In order to proove that point, we can look through the number of tweets that have began on russian territory.

In [89]:
start_tweet = english_df[english_df['post_type']=='QUOTE_TWEET']
region_group = start_tweet.groupby('region')
region_group.count().head()

# Evolution of russian tweets
#start_russia = start_tweet[start_tweet['region'] == 'Russian Federation']
#start_russia

Unnamed: 0_level_0,external_author_id,author,content,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Iraq,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Russian Federation,18,18,18,18,18,18,18,18,18,18,18,18,18,18
Ukraine,2,2,2,2,2,2,2,2,2,2,2,2,2,2
United Kingdom,265,265,265,265,265,265,265,265,265,265,265,265,265,265
United States,8560,8560,8560,8560,8560,8560,8560,8560,8560,8560,8560,8560,8560,8560


In [108]:
keywords = 'hillary|emails'

#scandale_df = english_df[english_df['post_type']=='QUOTE_TWEET']
scandale_df = english_df[english_df['content'].astype(str).str.contains(keywords)]
scandale_df = scandale_df[scandale_df['post_type'] == 'QUOTE_TWEET']

scandale_group = scandale_df.groupby('region')
scandale_group.count()
#scandale_df.head()

Unnamed: 0_level_0,external_author_id,author,content,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United States,208,208,208,208,208,208,208,208,208,208,208,208,208,208
Unknown,93,93,93,93,93,93,93,93,93,93,93,93,93,93


In [114]:
keywords = 'cambridge|analytic'

#scandale_df = english_df[english_df['post_type']=='QUOTE_TWEET']
scandale_df = english_df[english_df['content'].astype(str).str.contains(keywords)]
scandale_df = scandale_df[scandale_df['post_type'] == 'QUOTE_TWEET']

scandale_group = scandale_df.groupby('region')
scandale_group.count()
#scandale_df.head()

Unnamed: 0_level_0,external_author_id,author,content,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United States,1,1,1,1,1,1,1,1,1,1,1,1,1,1


### Character of the tweets

When looking through the account category, we can see different categories :
* Non English
* Left and Right Troll
* Hashtag Gamer
* Newsfeed
* Fearmonger
* Commercial
* Unknown

The ones that might interest us the most, would be the 'non-english, and the left and right trolls. let see how much they are :

In [120]:
english_df_start_tweet = english_df[english_df['post_type'] == 'QUOTE_TWEET']
english_df_start_tweet['account_category'].unique()

array(['RightTroll', 'HashtagGamer', 'Unknown', 'LeftTroll', 'NonEnglish',
       'NewsFeed'], dtype=object)

In [122]:
account_category_group = english_df_start_tweet.groupby('account_category')
account_category_group.count()

Unnamed: 0_level_0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet
account_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
HashtagGamer,185,185,185,185,185,185,185,185,185,185,185,185,185,185
LeftTroll,467,467,467,467,467,467,467,467,467,467,467,467,467,467
NewsFeed,301,301,301,301,301,301,301,301,301,301,301,301,301,301
NonEnglish,6,6,6,6,6,6,6,6,6,6,6,6,6,6
RightTroll,15159,15159,15159,15159,15159,15159,15159,15159,15159,15159,15159,15159,15159,15159
Unknown,31,31,31,31,31,31,31,31,31,31,31,31,31,31


We can see here, that most of the tweet have started with a category corresponding to Right Troll (about 15000).