In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [25]:
# Load the pre-cleaned corpus and keep only the cleaned sentence text,
# exposed under the column name 'title' that the rest of the notebook uses.
final_file = pd.read_csv('CleanedData.csv').copy()
file_weighting = final_file["Cleaned_sentences"].copy().to_frame(name='title')

In [26]:
# Display the single-column frame ('title') holding the cleaned sentences.
file_weighting

Unnamed: 0,title
0,volunteer dies sheep charges therapy farm kim ...
1,elusive truth behind attack french soccer star...
2,michael steinhardt billionaire surrenders mill...
3,instagram says parental controls arrive march ...
4,biden supreme court commission prepares vote f...
...,...
278,elizabeth holmes says former boyfriend abused ...
279,math equation tried stump internet published s...
280,cardiac angiosarcoma virgil abloh celebrated f...
281,parag agrawal twitter new c e longtime twitter...


In [27]:
# Build a word -> sentiment coefficient lookup from the precomputed dictionary file.
# If a word appears more than once in the file, the last occurrence wins.
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_words = sentiment_map['words'].to_numpy()
sentiment_coeffs = sentiment_map['sentiment_coeff'].to_numpy()
sentiment_dict = dict(zip(sentiment_words, sentiment_coeffs))

In [28]:
# Fit TF-IDF on whitespace-tokenised titles. norm=None keeps the raw
# (un-normalised) tf-idf weights, which are later used as per-word weights.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
# fit_transform learns the vocabulary and produces the document-term matrix
# in one pass (same result as fit() followed by transform() on the same data).
transformed = tfidf.fit_transform(file_weighting.title)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
features = pd.Series(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)



In [29]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tfidf score} dictionary for the sentence held in row x.

    inspired by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes;
        x.name is used as the row position inside transformed_file
        (assumes the dataframe index matches the matrix row order - TODO confirm
        if the index is ever not the default 0..n-1 range)
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix)
    features - names of all words in corpus used in TfidfVectorizer (pandas Series)

    returns: dict mapping every word with a stored tfidf entry in that row to its score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Look the words up into a separate array instead of writing them back into
    # vector_coo.col: the sparse column index is an integer array, and recent
    # SciPy versions cast anything assigned to it to that integer dtype, which
    # fails for strings.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    Turn a sentence into the list of tfidf scores of its words, in sentence order.

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    Raises KeyError when a whitespace-separated token of x.title has no entry
    in the row's tfidf dictionary.
    '''
    tfidf_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [tfidf_by_word[token] for token in x.title.split()]

In [30]:
# Row-wise replacement of every word by its tfidf score.
# This step takes around 3-4 minutes to calculate.
replaced_tfidf_scores = file_weighting.apply(
    replace_tfidf_words,
    axis=1,
    args=(transformed, features),
)

In [31]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment score associated with word in sentiment_dict.

    Words that are missing from the dictionary are scored as 0.
    '''
    return sentiment_dict.get(word, 0)

In [32]:
# For each title, the list of sentiment coefficients of its words (0 for unknown words).
replaced_closeness_scores = file_weighting['title'].apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in sentence.split()
    ]
)

In [36]:
# Put the per-word sentiment coefficients, the per-word tfidf weights and the
# original sentence side by side, one row per sentence.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': file_weighting.title,
})
# Sentence score = dot product of sentiment coefficients and tfidf weights.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.dot(np.asarray(row['sentiment_coeff']), np.asarray(row['tfidf_scores'])),
    axis=1,
)
# Strictly positive score -> class 1, otherwise class 0.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')

In [47]:
# Normalization of sentiment from -1 to 1: scale by the largest absolute rate.
max_abs_rate = replacement_df['sentiment_rate'].abs().max()
# Only rescale when there is a positive maximum. If every rate is 0 (or the
# frame is empty) the division would be 0/0 and fill the column with NaN;
# in that case the values are left as they are.
if max_abs_rate > 0:
    replacement_df['sentiment_rate'] = (replacement_df['sentiment_rate'] / max_abs_rate).round(4)

In [48]:
# Display the per-sentence score lists together with the normalised sentiment rate and the 0/1 prediction.
replacement_df

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
0,"[0, 1.0081666286733688, 0, -0.994787010441699,...","[5.955827057601261, 4.857214768933151, 5.95582...",volunteer dies sheep charges therapy farm kim ...,-0.0120,0
1,"[0, 0, 0, -1.0086543665424563, 0, 0, -1.004287...","[5.955827057601261, 5.955827057601261, 5.95582...",elusive truth behind attack french soccer star...,-0.1391,0
2,"[0, 0, 0, 0, -1.0298857698935404, 0, 0, 0, 0, ...","[5.955827057601261, 5.955827057601261, 5.95582...",michael steinhardt billionaire surrenders mill...,0.0643,1
3,"[0, -1.0074632391552878, 0, 0, 0, 1.0086171207...","[5.955827057601261, 4.084024880699669, 5.55036...",instagram says parental controls arrive march ...,0.0671,1
4,"[1.022014578520568, 0, 0.9969401802188572, 0, ...","[4.251078965362836, 5.039536325727106, 8.50215...",biden supreme court commission prepares vote f...,0.1014,1
...,...,...,...,...,...
278,"[0, 0, -1.0074632391552878, -1.014056444462660...","[5.2626798770413155, 11.100723898986192, 4.084...",elizabeth holmes says former boyfriend abused ...,-0.1627,0
279,"[0, 0, 0, 0, 1.019840496886929, 0, 0, 0, 0, 1....","[5.955827057601261, 5.955827057601261, 5.95582...",math equation tried stump internet published s...,0.0330,1
280,"[0, 0, 0, 0, 0, 0, 0, -0.9902078445723124, 0, ...","[5.550361949493096, 5.955827057601261, 5.95582...",cardiac angiosarcoma virgil abloh celebrated f...,-0.0044,0
281,"[0, 0, -1.0093933629118257, -1.009614047895300...","[5.955827057601261, 11.911654115202522, 10.525...",parag agrawal twitter new c e longtime twitter...,-0.1442,0


In [49]:
# Persist only the human-readable result columns; the row index is not written.
replacement_df.to_csv(
    'results.csv',
    columns=['sentence', 'sentiment_rate', 'prediction'],
    index=False,
)

In [23]:
# Evaluate the 0/1 predictions against ground-truth labels.
# NOTE(review): the cells above build replacement_df without a 'sentiment'
# column, so on a fresh "Restart & Run All" there is nothing to evaluate
# against. Fail with an explicit message instead of an opaque AttributeError.
if 'sentiment' not in replacement_df.columns:
    raise KeyError(
        "replacement_df has no 'sentiment' column: add the ground-truth labels "
        "to replacement_df before running the evaluation cell."
    )

predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment

# Rows are true classes, columns are predicted classes.
conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('Confusion Matrix')
display(conf_matrix)

# zero_division=0 keeps the score at 0.0 when a metric is undefined (e.g. no
# positive true labels) without emitting an undefined-metric warning.
test_scores = (
    accuracy_score(y_test, predicted_classes),
    precision_score(y_test, predicted_classes, zero_division=0),
    recall_score(y_test, predicted_classes, zero_division=0),
    f1_score(y_test, predicted_classes, zero_division=0),
)

print('\n \n Scores')
scores = pd.DataFrame(
    {'scores': test_scores},
    index=['accuracy', 'precision', 'recall', 'f1'],
)
display(scores)

Confusion Matrix


Unnamed: 0,0,1
0,170,113
1,0,0



 
 Scores


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,scores
accuracy,0.600707
precision,0.0
recall,0.0
f1,0.0
