# Assignment : Analyzing Twitter Data on 4th Industrial Revolution Technologies

We have a sample of 3000 tweets published by newspapers or their authors over time. The data covers the period from 2007 to 2019.

### Variables in the code

• id: Unique identifier for each tweet
• created_at: Time at which the tweet was posted
• text: The actual content of the tweet
• author.id: Unique identifier for each user
• author.name : The name of each Newspaper
• public_metrics.like_count: Number of likes
• public_metrics.retweet_count: Number of retweets
• label: Country code ISO-2

### Library use

In [None]:
import os
import pandas as pd
import re
import nltk
import spacy
from googletrans import Translator
from tqdm import tqdm
import time
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import matplotlib.pyplot as plt
import seaborn as sns
from unidecode import unidecode

## Task

## Data Pre-processing

In [None]:
# Load the tweet sample.
# The location defaults to the original author's checkout; set the
# TWEET_DATA_PATH environment variable to read the CSV from another machine
# without editing the script.
DATA_PATH = os.environ.get(
    'TWEET_DATA_PATH',
    'C:/Users/epcmic/OneDrive/Documents/GitHub/Transformer/Challenge/data_tweet_sample_challenge.csv',
)
dat = pd.read_csv(DATA_PATH)

# Working subset: only the columns used by the cleaning and exploration steps.
ANALYSIS_COLUMNS = [
    "id",
    "created_at",
    "text",
    "author.id",
    "author.name",
    "author.public_metrics.followers_count",
    "public_metrics.like_count",
    "public_metrics.retweet_count",
    "lang",
    "label",
]
var = dat.loc[:, ANALYSIS_COLUMNS]

###############################################################################

# Data Pre-processing
# Text cleaning on the tweet content.

# The stop-word set and the lemmatizer are built on first use and then shared
# by every clean_text call, instead of being re-created for each tweet.
_CLEAN_TEXT_TOOLS = {}


def _get_clean_text_tools():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it once."""
    if not _CLEAN_TEXT_TOOLS:
        _CLEAN_TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_TOOLS['stop_words'], _CLEAN_TEXT_TOOLS['lemmatizer']


def clean_text(text):
    """Normalise the text of one tweet.

    The text is transliterated to ASCII with ``unidecode``, lower-cased and
    split on whitespace. English stop words and tokens made up solely of
    '@' / '#' characters are dropped, the remaining tokens are lemmatized
    and joined back with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet content. Any non-string value (for example the NaN pandas
        uses for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text; ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN; unidecode cannot process them.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_clean_text_tools()
    marker_chars = ('@', '#')
    kept = [
        lemmatizer.lemmatize(word)
        for word in unidecode(text).lower().split()
        if word not in stop_words
        # A token such as "@", "#" or "##" carries no mention/hashtag name.
        and not all(char in marker_chars for char in word)
    ]
    return ' '.join(kept)

# Replace every raw tweet by its cleaned form, then display the column.
var['text'] = var['text'].map(clean_text)
print(var['text'])

## Data Exploration

### Visualize data in graph

#### By Newspapers

In [None]:
# =============================================================================
# Most active newspapers (number of tweets) per country
# =============================================================================
# Number of tweets (ids) per newspaper, all countries together
count_by_author = dat.groupby("author.name")["id"].count().reset_index()

# Tweets per (country, newspaper): countries in ascending order and, within a
# country, the busiest newspaper first
count_by_author_and_label = (
    dat.groupby(["label", "author.name"])["id"]
    .count()
    .reset_index()
    .sort_values(["label", "id"], ascending=[True, False])
)
# head(5) on the sorted frame keeps the 5 busiest newspapers of each country
top_5_by_label = count_by_author_and_label.groupby("label").head(5)
print(top_5_by_label)

# Horizontal bar chart, one colour per country
fig, ax = plt.subplots(figsize=(10, 6))
n_labels = top_5_by_label['label'].nunique()
for i, (label, grp) in enumerate(top_5_by_label.groupby('label')):
    ax.barh(y=grp['author.name'], width=grp['id'], color=plt.cm.tab10(i / n_labels), label=label)

# barh puts the newspapers on the y axis and the counts on the x axis
plt.xlabel("Number of articles")
plt.ylabel("Author")
plt.title("Top 5 of author who tweet the most by country ")
# The bars are labelled by country, so show the colour key
ax.legend(title="label")
plt.show()

#### Distribution of tweets over time, by newspaper
data = dat[['author.name', 'created_at']].copy()

# created_at is only converted to datetime on `dat` further down the script,
# so it is parsed here; calling .dt on the raw CSV strings raises
# AttributeError.
data['year'] = pd.to_datetime(data['created_at']).dt.year

# Count the number of tweets by newspaper (one row = one tweet)
tweet_counts = data.groupby('author.name').size().reset_index(name='tweet_count')

# Select the top 10 newspapers with the most tweets
top_newspapers = tweet_counts.nlargest(10, 'tweet_count')['author.name'].tolist()

# Filter the data to keep only the top 10 newspapers
data = data[data['author.name'].isin(top_newspapers)]

# Number of tweets per (newspaper, year)
data_by_newspaper = data.groupby(['author.name', 'year']).size().reset_index(name='tweet_count')

# One column per newspaper, one row per year; a newspaper with no tweet in a
# given year gets 0
pivoted_data = data_by_newspaper.pivot(index='year', columns='author.name', values='tweet_count').fillna(0)

# Grouped bar chart: one group of bars per year.
# Style, palette and figure size are passed in a single sns.set() call because
# sns.set() resets style and palette to its own defaults when they are omitted.
sns.set(
    style="whitegrid",
    palette=sns.color_palette("husl", len(top_newspapers)),
    rc={'figure.figsize': (12, 8)},
)
pivoted_data.plot(kind='bar')
plt.xlabel('Year')
plt.ylabel('Number of tweets')
plt.title('Number of tweets per year for the top 10 newspapers')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

# =============================================================================
# Five most active newspapers of each country, year by year
# =============================================================================
dat['created_at'] = pd.to_datetime(dat['created_at'])
dat['year'] = dat['created_at'].dt.year

# Tweets per (year, newspaper, country), ordered so that within every
# (country, year) pair the busiest newspaper comes first
count_by_author_and_label_and_year = (
    dat.groupby(['year', 'author.name', 'label'])['id']
    .count()
    .reset_index()
    .sort_values(['label', 'year', 'id'], ascending=[True, True, False])
)

# head(5) on the sorted frame keeps the 5 busiest newspapers per (country, year)
top_5_by_label_and_year = count_by_author_and_label_and_year.groupby(['label', 'year']).head(5)

# One figure per country; inside a figure each year gets its own colour.
# NOTE(review): bars of different years are drawn at the same x position for a
# given newspaper, so later years can hide earlier ones - confirm this is the
# intended rendering.
for label, grp in top_5_by_label_and_year.groupby('label'):
    fig, ax = plt.subplots(figsize=(10, 6))
    n_years = grp['year'].nunique()
    for i, (year, year_grp) in enumerate(grp.groupby('year')):
        ax.bar(year_grp['author.name'], year_grp['id'], color=plt.cm.tab10(i / n_years), label=year)
    plt.xlabel('Auteur')
    plt.ylabel("Nombre d'articles")
    # The country code goes into the title; without it every figure produced
    # by this loop carries exactly the same title.
    plt.title(f"Top 5 of authors who tweets the most by country ({label})")
    plt.legend()
    plt.show()
  
##### Number of tweets by country and by year

var['year'] = pd.DatetimeIndex(var['created_at']).year

# Number of tweets for every (year, country) pair
tweets_by_country_year = var.groupby(['year', 'label']).size().reset_index(name='count')

# Style and figure size are passed in a single sns.set() call: calling
# sns.set(rc=...) after sns.set_style() resets the style to seaborn's default.
# No global palette is set because the bar colours come from the explicit
# `palette` argument below.
sns.set(style="whitegrid", rc={'figure.figsize': (12, 8)})
sns.barplot(x="year", y="count", hue="label", data=tweets_by_country_year,
            palette=sns.color_palette("husl", tweets_by_country_year['label'].nunique()))
plt.xlabel('Year')
plt.ylabel("Number of tweets")
plt.title("Number of tweets by country by year")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

#### Like by followers

In [None]:
# =============================================================================
# Likes per follower
# =============================================================================
# Likes received by each tweet relative to the audience of its author.
# A follower count of 0 is turned into NaN first: dividing by 0 would yield
# inf, and a single inf would push that newspaper to the top of the ranking.
followers = var['author.public_metrics.followers_count'].replace(0, float('nan'))
var['likes_per_follower'] = var['public_metrics.like_count'] / followers

# Mean likes-per-follower of every (country, newspaper) pair, then the 5
# highest values inside each country
top_journals_by_country = var.groupby(['label', 'author.name'])['likes_per_follower'].mean().groupby('label', group_keys=False).nlargest(5)
print(top_journals_by_country)

var['created_at'] = pd.to_datetime(var['created_at'])

# New columns for the year and the month
var['year'] = var['created_at'].dt.year
var['month'] = var['created_at'].dt.month

# Keep only the tweets of the selected (country, newspaper) pairs. Only the
# two key columns are merged in, so `likes_per_follower` is not duplicated
# into suffixed _x / _y columns.
top_pairs = top_journals_by_country.reset_index()[['label', 'author.name']]
merged_data = var.merge(top_pairs, on=['author.name', 'label'])

# Mean number of likes per year, one bar per month
plt.figure(figsize=(12, 6))
sns.barplot(x='year', y='public_metrics.like_count', hue='month', data=merged_data, ci=None, palette='Set3')
plt.xlabel('year')
plt.ylabel('Number of Like')
plt.title('Distribution of Likes byyear and month for the most important newspapers')
plt.xticks(rotation=45)
plt.show()



#### Likes by year for the top newspaper of each country
data = dat[['public_metrics.like_count', 'author.name', 'label', 'created_at']].copy()
# Idempotent: makes this section independent of where created_at was parsed
data['created_at'] = pd.to_datetime(data['created_at'])

# Total number of likes per (country, newspaper, year)
count_by_year_author_label = data.groupby(['label', 'author.name', data['created_at'].dt.year])['public_metrics.like_count'].sum().reset_index()

# For each country, the row with the highest yearly like total. The lookup
# uses the like column: this frame has no 'public_metrics.retweet_count'
# column, so selecting it raised a KeyError.
top_author_per_label = count_by_year_author_label.loc[count_by_year_author_label.groupby('label')['public_metrics.like_count'].idxmax()]

# Keep only the tweets of the selected (newspaper, country) pairs
merged_data = pd.merge(data, top_author_per_label[['author.name', 'label']], on=['author.name', 'label'], how='inner')

# Style, palette and figure size go into one sns.set() call, because
# sns.set(rc=...) on its own resets style and palette to seaborn's defaults.
sns.set(
    style="whitegrid",
    palette=sns.color_palette("husl", len(top_author_per_label['label'].unique())),
    rc={'figure.figsize': (12, 8)},
)
sns.barplot(x=merged_data['created_at'].dt.year, y=merged_data['public_metrics.like_count'], hue=merged_data['author.name'], palette='husl', ci=None)
plt.xlabel('Année')
# The plotted quantity is the like count, not the retweet count
plt.ylabel('Nombre de likes')
plt.title('Distribution des like par année pour chaque journal qui tweet le plus par pays')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()


#### Number of likes by year and country
data = dat[['public_metrics.like_count', 'author.name', 'label', 'created_at']].copy()
# Idempotent: guarantees the .dt accessor below works on this frame
data['created_at'] = pd.to_datetime(data['created_at'])

# Sum of likes for every (country, newspaper, year) combination
yearly_key = data['created_at'].dt.year
count_by_year_author_label = (
    data.groupby(['label', 'author.name', yearly_key])['public_metrics.like_count']
    .sum()
    .reset_index()
)

# Per country, the (newspaper, year) row with the largest like total
best_rows = count_by_year_author_label.groupby('label')['public_metrics.like_count'].idxmax()
top_author_per_label = count_by_year_author_label.loc[best_rows]

# Inner merge acts as a filter: only tweets of the selected newspapers remain
merged_data = pd.merge(data, top_author_per_label[['author.name', 'label']], on=['author.name', 'label'], how='inner')

# A single sns.set() call carries style, palette and size; a later bare
# sns.set(rc=...) would reset the style to seaborn's default.
sns.set(
    style="whitegrid",
    palette=sns.color_palette("husl", len(top_author_per_label['label'].unique())),
    rc={'figure.figsize': (12, 8)},
)
sns.barplot(x=merged_data['created_at'].dt.year, y=merged_data['public_metrics.like_count'], hue=merged_data['author.name'], palette='husl', ci=None)
plt.xlabel('Année')
# The y axis shows likes; the previous label announced retweets
plt.ylabel('Nombre de likes')
plt.title('Distribution des like par année pour chaque journal qui tweet le plus par pays')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()



#### Retweet

In [None]:
### Number of retweets by year and country
# .copy() gives an independent frame: adding columns to a bare column
# selection of `dat` triggers pandas' SettingWithCopyWarning.
data = dat[['created_at', 'public_metrics.retweet_count', 'label']].copy()
data['created_at'] = pd.to_datetime(data['created_at'])
data['year'] = data['created_at'].dt.year

# Total number of retweets per (country, year)
count_by_year_label = data.groupby(['label', 'year'])['public_metrics.retweet_count'].sum().reset_index()

# Style, palette and figure size are set in one call; a separate
# sns.set(rc=...) afterwards would reset the "husl" palette and the style
# before the bars are drawn.
sns.set(style="whitegrid", palette="husl", rc={'figure.figsize': (12, 8)})
sns.barplot(x='year', y='public_metrics.retweet_count', hue='label', data=count_by_year_label)
plt.xlabel('Année')
plt.ylabel('Number of retweets')
plt.title('Number of retweets by year and by country')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()


### Number of retweets by year for the most retweeted newspaper of each country
data = dat[['public_metrics.retweet_count', 'author.name', 'label', 'created_at']].copy()
# Idempotent: keeps the .dt accessor usable whatever the state of `dat`
data['created_at'] = pd.to_datetime(data['created_at'])

# Total number of retweets per (country, newspaper, year)
count_by_year_author_label = data.groupby(['label', 'author.name', data['created_at'].dt.year])['public_metrics.retweet_count'].sum().reset_index()

# For each country, the (newspaper, year) row with the highest retweet total
top_author_per_label = count_by_year_author_label.loc[count_by_year_author_label.groupby('label')['public_metrics.retweet_count'].idxmax()]

# Keep only the tweets of the selected (newspaper, country) pairs
merged_data = pd.merge(data, top_author_per_label[['author.name', 'label']], on=['author.name', 'label'], how='inner')

# One sns.set() call for style, palette and size: sns.set(rc=...) alone
# resets the style and palette chosen just before it.
sns.set(
    style="whitegrid",
    palette=sns.color_palette("husl", len(top_author_per_label['label'].unique())),
    rc={'figure.figsize': (12, 8)},
)
sns.barplot(x=merged_data['created_at'].dt.year, y=merged_data['public_metrics.retweet_count'], hue=merged_data['author.name'], palette='husl', ci=None)
plt.xlabel('Année')
plt.ylabel('Nombre de retweets')
plt.title('Distribution des retweets par année pour chaque journal qui tweet le plus par pays')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()


#### Extract hashtags from the tweet text

In [None]:
# Re-wrap `var` with the DataFrame constructor before the hashtag column is added.
# NOTE(review): `var` is already built with dat.loc[...] earlier in the script,
# so this call looks redundant - confirm before removing it.
var = pd.DataFrame(var)

# Hashtag extraction: a hashtag is a '#' followed by one or more word characters
def extract_hashtags(text):
    """Return the hashtags contained in ``text``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    list of str
        Every match of ``#\\w+`` (leading '#' included), in order of
        appearance; an empty list when there is none.
    """
    return [found.group(0) for found in re.finditer(r'#\w+', text)]

# Store, for every tweet, the list of hashtags found in its text
var['hashtags'] = var['text'].map(extract_hashtags)

# Display the resulting frame
print(var)

## Data Analysis