In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
# Read the tweets dataset from disk and summarise each of its columns
raw_data = pd.read_csv(filepath_or_buffer="Tweets.csv")
raw_data.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [3]:
# Drop every row that contains at least one missing (NaN) value
raw_data = raw_data.dropna(axis=0, how="any")
raw_data.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27480,27480,27480,27480
unique,27480,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11117


In [4]:
# Remove the two columns that are not used to predict the sentiment.
# The tweet identifier goes first.
raw_data = raw_data.drop(columns=['textID'])
# `selected_text` could be confusing as a model input (`text` is enough for the analysis),
# but it is stored for the word-frequency analysis further down.
# `.copy()` makes it an independent frame instead of a slice of `raw_data`, so assigning
# to its columns later does not raise pandas' SettingWithCopyWarning.
selected_text = raw_data[['selected_text', 'sentiment']].copy()
raw_data = raw_data.drop(columns=['selected_text'])
raw_data

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


## Balancing the dataset

Each sentiment should have the same number of samples. This ensures that the model is not biased towards the most frequent sentiment, and it makes it possible to split the dataframe equally for each sentiment.

In [5]:
# Count how many tweets carry each sentiment label
raw_data.loc[:, 'sentiment'].value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [6]:
# Downsample every sentiment class to the size of the smallest one.
# The size is derived from the data instead of being hard-coded, and the fixed seed
# makes the sample (and therefore the exported CSV) identical on every run.
SEED = 42
min_class_count = int(raw_data['sentiment'].value_counts().min())

# `GroupBy.sample` draws `n` rows from each group and keeps the `sentiment` column,
# unlike `groupby.apply`, whose handling of grouping columns is deprecated.
data = (
    raw_data
    .groupby('sentiment')
    .sample(n=min_class_count, random_state=SEED)
    .reset_index(drop=True)
)
data['sentiment'].value_counts()

negative    7781
neutral     7781
positive    7781
Name: sentiment, dtype: int64

In [7]:
# Preview the balanced dataset (long frames are shown truncated)
data

Unnamed: 0,text,sentiment
0,- It is supposed to be an unrelated (story-wi...,negative
1,homework on a friday night...lame,negative
2,gutted - the handbag I wanted has been sold!,negative
3,i miss 'mr.',negative
4,me too i hate revision,negative
...,...,...
23338,My gut says to replace $$$ appliances instead ...,positive
23339,aaawww no worries fresh start to work on gro...,positive
23340,"weird as usual, but ok... that`s why we like it",positive
23341,"????? ,my latest obession.",positive


## Exporting the preprocessed data to a CSV file

In [8]:
# Persist the balanced dataset; the row index is not written to the file
data.to_csv(path_or_buf='Tweets_preprocessed.csv', index=False)

## Words most used in tweets by category

We look for the most used words in each sentiment category, together with their frequencies, to get an idea of the criteria our algorithm may take into account.

That is why the `selected_text` column was stored earlier.

In [9]:
from collections import Counter
import re

# Tunable parameters of the word-frequency analysis
MIN_WORD_LENGTH = 4   # shorter words are ignored to avoid most connectors
TOP_N_WORDS = 5       # number of words reported per sentiment

# Matches whole words made of at least MIN_WORD_LENGTH word characters
word_pattern = re.compile(rf'\b\w{{{MIN_WORD_LENGTH},}}\b')

# Cast the column to str. `assign` returns a new frame instead of writing into an
# existing one, so no SettingWithCopyWarning is raised even if `selected_text`
# was created as a slice of another dataframe.
selected_text = selected_text.assign(
    selected_text=selected_text['selected_text'].astype(str)
)

# One (word, frequency, sentiment) record per reported word
records = []

for sentiment in selected_text['sentiment'].unique():

    # Select only the entries that have the current label in the column "sentiment"
    subset_data = selected_text[selected_text['sentiment'] == sentiment]

    # Join all the selected texts of this sentiment into a single text string
    text_string = ' '.join(subset_data['selected_text'])

    # Split the text string into words and count their frequency (case-insensitive).
    # Only words with at least MIN_WORD_LENGTH characters are taken into account.
    word_freq = Counter(word.lower() for word in word_pattern.findall(text_string))

    # Collect the most used words of this sentiment and their frequency
    for word, freq in word_freq.most_common(TOP_N_WORDS):
        records.append((word, freq, sentiment))

# Build the summary dataframe once from the collected records
words_most_used = pd.DataFrame(records, columns=["Word", "Frequency", "Sentiment"])
words_most_used

Unnamed: 0,Word,Frequency,Sentiment
0,that,1040,neutral
1,have,996,neutral
2,just,898,neutral
3,with,818,neutral
4,this,576,neutral
5,miss,359,negative
6,sorry,300,negative
7,that,286,negative
8,hate,230,negative
9,have,171,negative


## Exporting this dataframe to a CSV file

In [10]:
# Persist the word-frequency table; the row index is not written to the file
words_most_used.to_csv(path_or_buf='Words_most_used.csv', index=False)