In [None]:
## import pandas as pd # Pandas is a library used for data manipulation and analysis, including reading and writing CSV files.
import pickle  # Used to load the previously trained (serialized) SVC model from disk.
import re  # A module used for regular expression operations in Python.

import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer # Stemming algorithm used to reduce words to their base or root form.
from sklearn.feature_extraction.text import TfidfVectorizer  # A vectorizer used to transform text data into numerical feature vectors

## Predict my self-made dataset
Here we use the trained SVC model (trained with the data from Sentiment/training.1600000.processed.noemoticon.csv) to predict the labels of my self-made dataset. The main purpose is to fill in all 30000 rows of my dataset faster than manual labeling would allow, although I will have to review the results later and check whether the predictions are correct.

This first part just formats and cleans the dataset.

In [None]:
# Same data as CleanTranslateDataset/final_dataset_labeled.csv; the file was renamed
# so that its name describes the content more accurately.
df_toPredict = pd.read_csv(
    './final_dataset_partially_labeled.csv',
    sep=',',
    encoding='ISO-8859-1',
)

In [None]:
# Show the frame exactly as it was read from the CSV, before any renaming or cleaning.
df_toPredict

In [None]:
# Give the four columns stable names for the rest of the notebook.
df_toPredict.columns = ['player', 'text', 'media', 'sentiment']

# Replace missing cells with the sentinel 999 and +/-infinity with -1,
# so that no NaN/inf values remain in the frame.
df_toPredict = (
    df_toPredict
    .fillna(999)
    .replace([np.inf, -np.inf], -1)
)

In [None]:
# Rows whose sentiment is 1 are recoded to 4
# (presumably to match the 0/4 labels of the training data — TODO confirm).
positive_mask = df_toPredict['sentiment'].eq(1)
df_toPredict.loc[positive_mask, 'sentiment'] = 4

In [None]:
# Store the sentiment labels as integers.
df_toPredict = df_toPredict.astype({'sentiment': int})

In [None]:
# Show the frame after renaming the columns, filling missing values and casting `sentiment` to int.
df_toPredict

Since I created and partially labeled this dataset myself, I know that I only labeled the rows up to row 1420. The rows after that are the ones we want to label using the SVC model.

In [None]:
# Discard the first 1420 rows (the manually labeled ones) and renumber the rest from 0.
# Note: re-running this cell would discard another 1420 rows.
df_toPredict = df_toPredict.iloc[1420:].reset_index(drop=True)

In [None]:
# Show the rows that remain after slicing off the manually labeled part.
df_toPredict

Now I will iterate over the dataset and use the functions defined below to label every row with a 1 or a 0. I don't label them with 0 or 4 because the goal of this task is to use the pre-trained BERT model in another notebook (BERT-fine-tuning-and-evaluation.ipynb), and that model needs 0s and 1s to work properly.

In [None]:
# Objects needed to predict sentiment: the text vectorizer and the trained model.
# NOTE(review): this TfidfVectorizer is newly constructed and is not fitted anywhere in the
# visible cells, yet it is later passed to `vectorizer.transform(...)`. scikit-learn
# vectorizers need a fitted vocabulary before `transform` — confirm whether the vectorizer
# fitted during training should be loaded here instead.
tfidf = TfidfVectorizer()
# Load the serialized SVC model. `pickle.load` can execute arbitrary code while
# deserializing, so only load a file that you produced yourself.
with open('svc.pickle', 'rb') as handle:
    svc = pickle.load(handle)

In [None]:
# Shared Porter stemmer instance; clean_text() uses it to stem each token it keeps.
stemmer = PorterStemmer()

def clean_text(text):
    """
    Clean a raw text by removing symbols and stemming the remaining words.

    Steps: replace tag-like spans, collapse runs of digits/non-word characters
    into a single space, replace anything that is not an ASCII letter, lowercase,
    split on whitespace, drop every token found in `stuff_to_be_removed`, stem
    the kept tokens with the module-level `stemmer`, and join them with spaces.

    Parameters
    ----------
    text : str
        The raw text to clean. A non-string value makes `re.sub` raise TypeError.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if nothing is left).

    Notes
    -----
    NOTE(review): `stuff_to_be_removed` is not defined in the visible cells —
    confirm it is created before this function is called.
    """
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)
    text = re.sub("(\\d|\\W)+", " ", text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = text.lower().split()

    # Build the lookup set once per call instead of once per token.
    unwanted = set(stuff_to_be_removed)

    # Only stemming is applied here; no lemmatizer is involved.
    return ' '.join(stemmer.stem(word) for word in tokens if word not in unwanted)

# Quick manual check of clean_text on a sample sentence; the result is shown as the cell output.
clean_text("I know you are really efficient")

In [None]:
def predict_sentiment_test(model, vectorizer, text):
    """
    Predict the sentiment of a single text.

    The text is first cleaned with `clean_text`, then passed as a one-element
    list to `vectorizer.transform`, and the result is given to `model.predict`.

    Parameters
    ----------
    model : object exposing `predict(features)`
    vectorizer : object exposing `transform(list_of_texts)`
    text : the raw text to classify

    Returns
    -------
    The first element of whatever `model.predict` returns.
    """
    cleaned = clean_text(text)
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

def predict_dataset(model, vectorizer, df, positive_labels=(1, 4)):
    """
    Predict the sentiment of every row of `df` and return a labeled copy.

    Each value of the `text` column is passed to `predict_sentiment_test`. The
    raw prediction is mapped to 1 when it is one of `positive_labels` and to 0
    otherwise. Rows whose prediction raises an exception get the marker string
    "ERROR" instead, so they can be filtered out afterwards.

    Parameters
    ----------
    model : object
        Trained classifier, forwarded to `predict_sentiment_test`.
    vectorizer : object
        Text vectorizer, forwarded to `predict_sentiment_test`.
    df : pandas.DataFrame
        Frame with a `text` column. It is not modified.
    positive_labels : tuple, optional
        Raw model outputs that count as positive. The default accepts both 1 and 4
        because the notebook text indicates the model was trained on 0/4-labeled
        data, in which case comparing against 1 alone would label every row 0.
        TODO: confirm the labels the model actually emits.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` whose `sentiment` column holds 0, 1 or "ERROR" per row.

    Raises
    ------
    KeyError
        If `df` has no `text` column.
    """
    labels = []

    # NOTE(review): `tqdm` is not imported in the visible cells — confirm it is in scope.
    for text in tqdm(df['text'], total=len(df)):
        try:
            raw_label = predict_sentiment_test(model, vectorizer, text)
            label = 1 if raw_label in positive_labels else 0
        except Exception:
            # Best effort: mark the row and keep going. `Exception` (not a bare
            # `except`) so that Ctrl-C / kernel interrupt still stops the loop.
            label = "ERROR"
        labels.append(label)

    # Assign the whole column at once: the dtype is inferred from all labels together,
    # so an "ERROR" marker never has to be written into an integer column.
    pred_df = df.copy()
    pred_df['sentiment'] = labels
    return pred_df

In [None]:
# Label every remaining row with the loaded model and vectorizer, then show the result.
predicted_dataset = predict_dataset(model=svc, vectorizer=tfidf, df=df_toPredict)
predicted_dataset

In [None]:
# Remove the rows that could not be labeled: predict_dataset marks them with the
# string "ERROR" in the `sentiment` column. The filter is applied to
# `predicted_dataset` itself (there is no notebook-level `df`).
predicted_dataset = predicted_dataset[predicted_dataset['sentiment'] != 'ERROR']
predicted_dataset

Save the progress as usual (the save cells below are commented out; uncomment them to actually write the files).

In [None]:
#with open('myDataset_Predicted.pickle', 'wb') as handle:
#    pickle.dump(predicted_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#predicted_dataset.to_csv("PredictCleanDataset/myDataset_Predicted.csv", index=False)