# Project 1 - Ilias Laadar - Mohamed Abdelaziz

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import string
import re
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer

import pickle

In [2]:
train = pd.read_csv('tweets.csv')
train = train.drop(columns=['tweet_id', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone'])
train = train.rename(columns={"airline_sentiment": "sentiment"})

In [3]:
train.head()

Unnamed: 0,sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [4]:
train.sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: sentiment, dtype: int64

In [5]:
def preprocessing(tweet):
    text = BeautifulSoup(tweet,).get_text()
    text = re.sub("[^a-zA-Z]", " ", text)
    words = text.lower().split()
    stopword = set(stopwords.words("english"))
    text = [w for w in words if not w in stopword]
    
    return(" ".join(text))

In [6]:
def clean_t(df):
    nb_tweets = df["text"].size
    clean_tweets = []

    for i in range(0, nb_tweets):                                                                
        clean_tweets.append(preprocessing(df["text"][i]))
        
    return clean_tweets

In [7]:
def split_text(train_features, df):
    X_train, X_test, y_train, y_test = train_test_split(train_features, df["sentiment"], test_size=0.2, random_state=0)
    
    return X_train, X_test, y_train, y_test

In [8]:
def model(X_train, y_train):
    rf_model = RandomForestClassifier(n_estimators = 150)
    rf_model = rf_model.fit(X_train, y_train)
    
    return rf_model

In [9]:
def predictions(rf_model, X_test):
    rf_predictions = rf_model.predict(X_test)
    
    return rf_predictions

In [10]:
def evaluation(y_test, rf_predictions):
    accuracy = accuracy_score(y_test, rf_predictions)
    f1score = f1_score(y_test, rf_predictions, average=None)
    precision = precision_score(y_test, rf_predictions, average=None)
    recall = recall_score(y_test, rf_predictions, average=None)
    
    return accuracy, f1score, precision, recall

In [11]:
clean_tweets = clean_t(train)

vectorizer = CountVectorizer(max_features = 6000) 
train_features = vectorizer.fit_transform(clean_tweets)
train_features = train_features.toarray()

(X_train, X_test, y_train, y_test) = split_text(train_features, train)

rf_model = model(X_train, y_train)

rf_predictions = predictions(rf_model, X_test)

(accuracy, f1score, precision, recall) = evaluation(y_test, rf_predictions)

print('Accuracy :', accuracy)
print('F1score :', f1score)
print('Precision :', precision)
print('Recall :', recall)

Accuracy : 0.7605874316939891
F1score : [0.8543439  0.53716216 0.63556116]
Precision : [0.82478845 0.55789474 0.72206304]
Recall : [0.88609626 0.51791531 0.56756757]


In [12]:
#pickle.dump(rf_model, open("model.pkl", "xb"), protocol=2)
#pickle.dump(vectorizer, open('vectorizer.pkl','xb'), protocol=2)

## Unit test

In [13]:
# Slean function test
clean_test = clean_t(train)
clean_test[:5]

['virginamerica dhepburn said',
 'virginamerica plus added commercials experience tacky',
 'virginamerica today must mean need take another trip',
 'virginamerica really aggressive blast obnoxious entertainment guests faces little recourse',
 'virginamerica really big bad thing']

In [14]:
# Split function test
(X_train, X_test, y_train, y_test) = split_text(train, train)
print(train.shape)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(14640, 2)
(11712, 2) (2928, 2) (11712,) (2928,)


In [15]:
# Predictions of the model
X_test.head()

Unnamed: 0,sentiment,text
13983,negative,@AmericanAir In car gng to DFW. Pulled over 1h...
14484,negative,"@AmericanAir after all, the plane didn’t land ..."
6403,negative,@SouthwestAir can't believe how many paying cu...
9653,negative,@USAirways I can legitimately say that I would...
13268,negative,@AmericanAir still no response from AA. great ...


## Integration test

In [16]:
def test_the_model(text, text2, text3):
    test = pd.DataFrame(columns = ['text'])
    test.loc[0, 'text'] = text
    test.loc[1, 'text'] = text2
    test.loc[2, 'text'] = text3
    
    clean_tweets = clean_t(test)

    test_features = vectorizer.transform(clean_tweets)
    test_features = test_features.toarray()

    rf_predictions = predictions(rf_model, test_features)

    test['pred'] = rf_predictions
    
    return test

In [17]:
text = 'It was a very good film'
text2 = 'This film is so bad, i hate it'
text3 = 'This message is neutral'

test = test_the_model(text, text2, text3)
test

Unnamed: 0,text,pred
0,It was a very good film,positive
1,"This film is so bad, i hate it",negative
2,This message is neutral,neutral
