In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [26]:
# Load the UEFA tweet CSV from the Cleaned_Datasets folder; later cells read
# the text from its `tweets` column.
final_file = pd.read_csv("../Cleaned_Datasets/uefa_final.csv")

In [27]:
# Word -> sentiment coefficient lookup, built from the `words` and
# `sentiment_coeff` columns of the sentiment dictionary CSV.
sentiment_map = pd.read_csv("uefa_sentiment_dictionary.csv")
sentiment_dict = dict(
    zip(
        sentiment_map['words'].values,
        sentiment_map['sentiment_coeff'].values,
    )
)

In [28]:
# `dropna` returns a new frame, so its result must be assigned -- calling it
# on its own line only displays the cleaned frame and leaves NaN rows in
# `df_final`. Only `tweets` is consumed downstream, so only that column is
# checked. The index is reset because later cells use each row's index label
# as a positional row number into the TF-IDF matrix (and loop over 0..n-1).
# `final_file` itself is left untouched.
df_final = final_file.dropna(subset=['tweets']).reset_index(drop=True)
df_final

Unnamed: 0.1,Unnamed: 0,tweets
0,0,play uefa youth leagu take away stage fright g...
1,1,happi birthday 1x world cup 2014 5x fifa world...
2,2,never uefa cupeuropa leagu cl 135 year history...
3,3,look belaru week
4,4,countri see huge rise case complet season isn’...
...,...,...
16794,16794,lol matter uefa tri help citi won’t win ucl
16795,16795,way citi get easi draw yet someon uefa nude
16796,16796,citi pay lifetim easi round 16 give uefa
16797,16797,date 16th round match 202021 uefa champion lea...


In [29]:
# Tokenise on whitespace so the vocabulary mirrors the `str.split()` calls used
# when scores are looked up word by word; norm=None keeps the raw
# (un-normalised) tf-idf weights.
# NOTE(review): the vectorizer lowercases text by default, so the per-word
# lookups assume the tweets are already lower-case -- confirm.
tweet_texts = df_final['tweets'].values.astype('U')
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
# Fitting and transforming the same corpus in one call is equivalent to
# fit() followed by transform().
transformed = tfidf.fit_transform(tweet_texts)

# `get_feature_names` was removed in scikit-learn 1.2; only fall back to it on
# releases that predate `get_feature_names_out` (added in 1.0).
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())

In [6]:
def create_tfidf_dictionary(x, transformed_file, features):
    """Build a ``{word: tf-idf score}`` dictionary for one document.

    Parameters
    ----------
    x : row object
        Its ``name`` attribute is used as the row position in
        ``transformed_file``.
    transformed_file : sparse matrix
        One row per document, one column per feature.
    features : pandas.Series
        Maps a column position of ``transformed_file`` to its word.

    Returns
    -------
    dict
        Word -> score for every stored entry of the selected row.
    """
    row_coo = transformed_file[x.name].tocoo()
    # Resolve the words into a separate array rather than assigning them to
    # `row_coo.col`: that attribute is the matrix's integer column index, and
    # newer SciPy releases coerce assignments to it back to an integer dtype,
    # which fails for strings.
    words = features.iloc[row_coo.col].values
    return dict(zip(words, row_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    """Return the tweet's words replaced, in order, by their tf-idf scores.

    The tweet text is read from ``x.tweets`` and split on whitespace; each
    token is looked up in the per-row dictionary produced by
    ``create_tfidf_dictionary``. A token missing from that dictionary raises
    ``KeyError``.
    """
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    tokens = str(x.tweets).split()
    return [scores_by_word[token] for token in tokens]

In [7]:
def replace_sentiment_words(word, sentiment_dict):
    """Look up the sentiment score of ``word``.

    Returns the value stored under ``word`` in ``sentiment_dict``, or 0 when
    the lookup raises ``KeyError`` (i.e. the word has no entry).
    """
    try:
        return sentiment_dict[word]
    except KeyError:
        # Words without an entry contribute nothing to the score.
        return 0

In [30]:
# One list of tf-idf scores per tweet, in word order. The row itself is passed
# as the first argument; the matrix and vocabulary go through `args`.
replaced_tfidf_scores = df_final.apply(
    replace_tfidf_words,
    axis=1,
    args=(transformed, features),
)

In [31]:
# One list of sentiment coefficients per tweet, in word order; words without a
# dictionary entry score 0.
replaced_closeness_scores = df_final.tweets.apply(
    lambda tweet: [
        replace_sentiment_words(word, sentiment_dict)
        for word in str(tweet).split()
    ]
)
replaced_closeness_scores.head()

0    [1.2154296966912717, -1.2300455199206448, 1.57...
1    [2.685032187726711, -1.6972360825258102, -1.45...
2    [-1.4282080963388366, -1.2300455199206448, -2....
3           [2.285192178215255, 0.0, 1.77720704392224]
4    [1.6149768427838032, 2.4274016474143862, 2.106...
Name: tweets, dtype: object

In [32]:
# Gather the per-word sentiment coefficients, the per-word tf-idf weights and
# the original text side by side, one row per tweet.
predictions_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': df_final.tweets,
})
# Sentiment rate of a tweet = dot product of its sentiment coefficients with
# its tf-idf weights.
predictions_df['sentiment_rate'] = predictions_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)


In [33]:
# Class thresholds: `a` is the mean of the strictly positive sentiment rates,
# `b` the mean of the strictly negative ones.
a = predictions_df.loc[predictions_df['sentiment_rate'] > 0, 'sentiment_rate'].mean()
b = predictions_df.loc[predictions_df['sentiment_rate'] < 0, 'sentiment_rate'].mean()

In [34]:
# Label every tweet in one vectorised pass instead of a per-row `.loc` loop,
# which was slow and only worked when the index was exactly 0..n-1:
#   2 -> sentiment rate above the positive threshold `a`
#   0 -> sentiment rate below the negative threshold `b`
#   1 -> everything in between
# Conditions are tested in order, so `> a` takes precedence over `< b`, as in
# the original if/elif chain. Float labels keep the column dtype the row-wise
# assignment used to produce.
sentiment_rates = predictions_df['sentiment_rate']
predictions_df['prediction'] = np.select(
    [sentiment_rates > a, sentiment_rates < b],
    [2.0, 0.0],
    default=1.0,
)

In [35]:
# Number of tweets per predicted class (x: class 0, y: class 1, z: class 2),
# then each class's share of all tweets as a percentage.
x, y, z = (
    (predictions_df['prediction'] == label).sum()
    for label in (0, 1, 2)
)
total = x + y + z
final = [(count / total) * 100 for count in (x, y, z)]

In [20]:
#Covid_19 Sentiment Report
# NOTE(review): this table is built from whatever `final` currently holds; the
# recorded output presumably comes from a run on the Covid-19 dataset -- confirm.
covid_outcomes = pd.DataFrame({
    'percentage': final,
    'labels': ['Negative', 'Neutral', 'Positive'],
})
covid_outcomes

Unnamed: 0,percentage,labels
0,18.468964,Negative
1,64.290078,Neutral
2,17.240958,Positive


In [33]:
#BCCI Sentiment Report
# NOTE(review): this table is built from whatever `final` currently holds; the
# recorded output presumably comes from a run on the BCCI dataset -- confirm.
bcci_outcomes = pd.DataFrame({
    'percentage': final,
    'labels': ['Negative', 'Neutral', 'Positive'],
})
bcci_outcomes

Unnamed: 0,percentage,labels
0,14.620793,Negative
1,67.665286,Neutral
2,17.71392,Positive


In [14]:
#BTS Sentiment Report
# NOTE(review): this table is built from whatever `final` currently holds; the
# recorded output presumably comes from a run on the BTS dataset -- confirm.
bts_outcomes = pd.DataFrame({
    'percentage': final,
    'labels': ['Negative', 'Neutral', 'Positive'],
})
bts_outcomes

Unnamed: 0,percentage,labels
0,27.860126,Negative
1,69.872883,Neutral
2,2.266991,Positive


In [25]:
#REACTJS Sentiment Report
# NOTE(review): this table is built from whatever `final` currently holds; the
# recorded output presumably comes from a run on the ReactJS dataset -- confirm.
reactjs_outcomes = pd.DataFrame({
    'percentage': final,
    'labels': ['Negative', 'Neutral', 'Positive'],
})
reactjs_outcomes

Unnamed: 0,percentage,labels
0,24.373618,Negative
1,59.676721,Neutral
2,15.949661,Positive


In [36]:
#UEFA Sentiment Report
# Percentage of tweets per predicted class, labelled in class order 0, 1, 2.
uefa_outcomes = pd.DataFrame({
    'percentage': final,
    'labels': ['Negative', 'Neutral', 'Positive'],
})
uefa_outcomes

Unnamed: 0,percentage,labels
0,8.53027,Negative
1,65.795583,Neutral
2,25.674147,Positive
