# Sentiment Analysis of Tweets
This project collects tweets about soccer to analyze their sentiment, comparing results of Afinn and FastText.
Overview:
* Search, clean, and load the tweets
* Calculate sentimes using Afinn
* Train a FastText model 
* Find sentiments using FastText model

In [328]:
import requests
import csv
import json
import os

# Get tweets from Twitter API

In [351]:
def create_url(query):
    #query = "westhamunited -is:tweet"
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    tweet_fields = "tweet.fields=author_id"
    url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}".format(
        query, tweet_fields
    )
    return url


def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers


def connect_to_endpoint(url, headers):
    response = requests.request("GET", url, headers=headers)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [1]:
# Twitter information
api_key = ''
secret_key =  ''
bearer_token =  ''

In [369]:
def clean_tweet(item):
    item = re.sub("\#[^\s]+"," ",item) # remove @mention and #hashtag
    item = re.sub("\@[^\s]+"," ",item) # remove @mention 
    item = re.sub("(\w+:\/\/\S+)", " ", item)# remove urls
    item = re.sub("\W", " ", item) # remove non alphanumeric
    item = re.sub("\s+", " ", item) # remove more than one space
    item =  item.lower() # lowercase 
    return item

In [414]:
tweets = []
search_terms = ['Sheffield United', 'premier league']

for search in search_terms:
    query = f"{search} lang: en and not -has:links -is:retweet"
    url = create_url(query)
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)
    print(json.dumps(json_response, indent=4, sort_keys=True))

    for item in json_response['data']:
        tweet = clean_tweet(item['text'])
        tweets.append(tweet)

200
{
    "data": [
        {
            "author_id": "1168742702890008576",
            "id": "1373421929915412483",
            "text": "@psad2 @russell_liburd @Jayb1970NUFC @gone_bruce @gordon_woodburn @TOONYWOONY11 @whitleysfinest @LennyWalker1 @a6tox @PottskyRob @JBDT @Ethcommon @kunoglessi I hope you are not including Sheffield United and West Bromwich Albion in your equation of us being the worst team in the league."
        },
        {
            "author_id": "1065097051623645185",
            "id": "1373420977841971204",
            "text": "@calcutler @Saudi49er Yes i support the owner and i support Sheffield United. but one question did you hear Chris wilder say that he was sacked ?how would you feel when you realize that he left the club himself!? Sheffield United is not just manger or players .Sheffield united is a History \u2694\ufe0f\u2694\ufe0f\u2764\ufe0f"
        },
        {
            "author_id": "1015198246204035072",
            "id": "1373395874420756488",
 

In [415]:
tweets = list(dict.fromkeys(tweets))
for tweet in tweets:
    print(tweet)

 i hope you are not including sheffield united and west bromwich albion in your equation of us being the worst team in the league 
 yes i support the owner and i support sheffield united but one question did you hear chris wilder say that he was sacked how would you feel when you realize that he left the club himself sheffield united is not just manger or players sheffield united is a history 
brentford sheffield united and brighton tonight are the worst collective 270 mins i ve watched as a fan and 2 wins in 20 football not even of a championship standard he has to go 
 without a doubt any other club would have sacked him after the sheffield united if not brentford match plenty of time left then as well as matches and i refuse to believe with the right manager that these players are not good enough should be safe as houses
 they are likely to survive because fulham are not taking their chance albion and sheffield united are already down imo 
 even if he does i fear it could be too lat

## Calculate the sentiment of each record (or token), from the data source you have chosen.
Identify the theme for twenty of them, and report them in a excel sheet file. Your result excel file should include three columns:
* Tweet text or news entry.
* Sentiment scores from one of the baseline in the class, e.g. AfiNN, NRC, Bing (3 points)
* and one state-of-the-art library which we did not explain in the class, e.g. FastText, BERT, Word2Vec and GloVe.(7 points)
* Theme, which is a keyword you have extracted from them. This means you should perform theme analysis “manually” and not with algorithm. (5 points)

## Sentiment Scores using Afinn

In [417]:
from afinn import Afinn 
import pandas as pd 
  
#instantiate afinn 
afn = Afinn() 
           
# if score is > 0, then positive, neutral is 0, negative is < 0
afinn_scores = [afn.score(tweet) for tweet in tweets] 
print(afinn_scores)

[0.0, 8.0, 5.0, -3.0, 3.0, -1.0, 3.0, -1.0, 2.0, 0.0, 2.0, 0.0, 3.0, 2.0, 4.0, 0.0, 4.0, 13.0, 1.0, 0.0]


## Sentiment Scores using FastText


In [107]:
import fasttext

### Import the data

In [229]:
training_data = pd.read_csv('betsentiment-EN-tweets-sentiment-players.csv')
del training_data['tweet_date_created']
del training_data['tweet_id']
del training_data['language']
del training_data['sentiment_score']

### Clean the data

In [230]:
import re

In [231]:
# cleanup the text of the tweets
training_data['tweet_text'] = training_data['tweet_text'].apply(clean_tweet)

# match format required for fasttext
training_data["tweet_text"] = "__label__" + training_data["sentiment"] + " " + training_data["tweet_text"]
training_data.drop(training_data.loc[training_data['sentiment']=='MIXED'].index, inplace=True)

In [233]:
print(training_data)

                                                tweet_text sentiment
0        __label__NEUTRAL i vote for tap below to vote ...   NEUTRAL
1        __label__NEUTRAL when is your first match didn...   NEUTRAL
2        __label__NEUTRAL  you never know what you had ...   NEUTRAL
3        __label__POSITIVE  you look good man i d appre...  POSITIVE
4        __label__NEUTRAL  expect loads of excuses afte...   NEUTRAL
...                                                    ...       ...
1931331     __label__NEGATIVE  your mum is the pig asshole  NEGATIVE
1931332  __label__NEUTRAL  just get simbas face on your...   NEUTRAL
1931333  __label__NEUTRAL  thats why i said a fit bale ...   NEUTRAL
1931334  __label__NEUTRAL i vote for tap below to vote ...   NEUTRAL
1931335  __label__NEUTRAL  why alli that s a bizarre su...   NEUTRAL

[1918690 rows x 2 columns]


### Resample for accuracy

In [235]:
import seaborn as sns
training_data['sentiment'].value_counts(normalize=False)

NEUTRAL     1309490
POSITIVE     499342
NEGATIVE     109858
Name: sentiment, dtype: int64

In [236]:
# downsample NEUTRAL 
neutral_data = training_data[training_data.sentiment.eq('NEUTRAL')]
neutral_data = neutral_data.sample(500000)
print(neutral_data.shape)

(500000, 2)


In [237]:
# upsample NEGATIVE
negative_data = training_data[training_data.sentiment.eq('NEGATIVE')]
negative_data = negative_data.sample(500000,replace=True)
print(negative_data.shape)

(500000, 2)


In [238]:
# upsample POSITIVE
positive_data = training_data[training_data.sentiment.eq('POSITIVE')]
positive_data = positive_data.sample(500000,replace=True)
print(positive_data.shape)

(500000, 2)


In [239]:
# combine the samples...
frames = [neutral_data, negative_data, positive_data] 
final_training_data = pd.concat(frames)
print(final_training_data.shape)

(1500000, 2)


### Re-Check Sampling

In [240]:
final_training_data['sentiment'].value_counts(normalize=False)

NEGATIVE    500000
POSITIVE    500000
NEUTRAL     500000
Name: sentiment, dtype: int64

In [241]:
# export to csv for fasttext
del final_training_data['sentiment'] # remove the sentiment column - no longer needed
training_data.to_csv('training_data.csv', index = False)

### Train the model

In [242]:
model = fasttext.train_supervised(input="training_data.csv")

# Skipgram model :
#model = fasttext.train_unsupervised('training_data.csv', model='skipgram')

In [243]:
# save the model
model.save_model("model_tweets.bin")

### Run some tests

In [391]:
model.predict("you are the worst")

(('__label__NEGATIVE',), array([0.893053]))

In [392]:
model.predict("I love you!")

(('__label__POSITIVE',), array([0.99710155]))

In [393]:
model.predict("You are just okay")

(('__label__NEUTRAL',), array([0.99997783]))

### Get sentiments for the news titles

In [421]:
fasttext_scores = [model.predict(tweet) for tweet in tweets] 

## Export to CSV

In [422]:
def fasttext_cleanup(item):
    sentiment_name = item[0] # get first element of the tuple which has the sentiment name
    sentiment_name = str(sentiment_name) # cast to string
    sentiment_name = sentiment_name.replace("__label__", "") # remove the label prefix
    sentiment_name = re.sub("\W"," ",sentiment_name) # remove any extra characters
    sentiment_confidence = str(item[1]) # get the sentiment confidence score
    out = f"{sentiment_name}| Confidence:{sentiment_confidence}"
    out = re.sub("\s+", " ", out) # remove more than one space
    return out
    
    
# create datafram for results 
df = pd.DataFrame() 
df['tweet'] =  tweets 
df['afinn_score'] = afinn_scores
df['fasttext_score'] = fasttext_scores
df['fasttext_score'] = df['fasttext_score'].apply(fasttext_cleanup)

print(df)

                                                tweet  afinn_score  \
0    i hope you are not including sheffield united...          0.0   
1    yes i support the owner and i support sheffie...          8.0   
2   brentford sheffield united and brighton tonigh...          5.0   
3    without a doubt any other club would have sac...         -3.0   
4    they are likely to survive because fulham are...          3.0   
5    even if he does i fear it could be too late a...         -1.0   
6   must win game against brighton this and we hav...          3.0   
7   this is the same as the sheffield united game ...         -1.0   
8    that s not why your team is 15points behind s...          2.0   
9    we take another 6 points out of that run we w...          0.0   
10   but still not to state the obvious and i know...          2.0   
11   2 up within 5 minutes 1 shot on target the wh...          0.0   
12   not every player develop at early age there i...          3.0   
13  tkachuk constant

In [423]:
# write results to CSV
df.to_csv("sentiment_output.csv")