# <font color='red'>1. Import Libraries and Check Data<font color='red'> 🧐

In [1]:
# Basic Libraries 📚
# --------------------------------------
import random
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# NLP
# --------------------------------------
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


# Metrics 📐
# --------------------------------------
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate


# Machine Learning Models 🤖
# --------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

**Loading the Dataset**

In [2]:
training_data = pd.read_csv('/Users/halfdeb/Downloads/Twitter Dataset/train.csv')
testing_data = pd.read_csv('/Users/halfdeb/Downloads/Twitter Dataset/test.csv')

In [3]:
training_data.columns

Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

In [4]:
print(training_data.shape)
print(testing_data.shape)

(10980, 12)
(3660, 11)


In [5]:
training_data.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)


In [6]:
training_data.isnull().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

**DATASET STORY**
- text : Does not contain null data. Every text begins with @ due to the characteristic of twitter.
- alirline_sentiment : Does not contain null data.

These are the two coloumns which we will use for sentiment analysis

# <font color='red'>2. Text Preprocessing<font color='red'> 📝

### ON TRAINING DATASET

**creating a array of tuple, in which we are going to store text and sentiment**
- in this format [(text, sentiment),(text, sentiment).......]

In [7]:
df_train = training_data[['text', 'airline_sentiment']]
training_values = df_train.values
documents = []
for i in range(len(training_values)):
    documents.append([word_tokenize(training_values[i][0]), training_values[i][1]])

**cleaning the document(removing stopwords, punctuations and lemmetization)**

In [8]:
# lets shuffle the data first
random.seed(2)
random.shuffle(documents)

In [9]:
# creating stops(will contain stopwords and punctuations)
stops = stopwords.words('english') + list(string.punctuation)

In [10]:
#fuction to convert format of pos received from post_tag to suitable lemmatizer format
def get_simple_pos(tag): #creating simple tags to pass into the lemmatizer
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
#functions for lemmatization
lemmatizer = WordNetLemmatizer()
def clean_tweet(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower()) # after all the work lowering all the words
    return output_words

In [12]:
# implementing all the changes in the document
documents = [(clean_tweet(document), category) for document, category in documents]

**Count vectorizer**

In [13]:
# creating seperates variables for target and features
y_train = [category for document, category in documents]
x_train = [" ".join(document) for document, category in documents] #joing the words as count vector accept data in a form of sentence

In [14]:
count_vec = CountVectorizer(max_features=2000)
x_train_features = count_vec.fit_transform(x_train)

### ON TESTING DATASET

**cleaning the document(removing stopwords, punctuations and lemmetization)**

In [15]:
testing_data = np.array(testing_data['text'])

In [16]:
x_test = []
for document in testing_data:
    document = clean_tweet(word_tokenize(document))
    x_test.append(" ".join(document))

**count vectorizer**

In [17]:
x_test_features = count_vec.transform(x_test)

# <font color='red'>3. Creating Model & Predicting💻<font color='red'>

**random forest**

In [18]:
random_forest = RandomForestClassifier()
random_forest.fit(x_train_features, y_train)

In [19]:
prediction_randomforest = random_forest.predict(x_test_features)

**MULTINOMIAL NAIVE BAYES**

In [20]:
NB = MultinomialNB()
NB.fit(x_train_features, y_train)

Grid Search

In [21]:
parameters = {  
'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
}  
grid_search= GridSearchCV(NB, parameters)
grid_search.fit(x_train_features, y_train)

In [22]:
prediction_NB = grid_search.predict(x_test_features)

**LOGISTIC REGRESSION**

In [23]:
logistic = LogisticRegression()
logistic.fit(x_train_features, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
prediction_logistic = logistic.predict(x_test_features)

**SVM**

In [25]:
support_vectors = svm.SVC()
grid = {'C' : [1,1e2, 1e3, 5e3, 1e4],
       'gamma' : [1e-3, 5e-4, 1e-4, 5e-3]}
svm_grid = GridSearchCV(support_vectors, grid)
svm_grid.fit(x_train_features, y_train)

In [26]:
prediction_vectors = svm_grid.predict(x_test_features)

**KNeighborsClassifier**

In [27]:
knn = KNeighborsClassifier()
grid = {"n_neighbors":[3,5,7,9,11]}
knn_grid = GridSearchCV(knn, grid)
knn_grid.fit(x_train_features, y_train)

 # <font color='red'> Prediction And Saving the Prediction <font color='red'>

In [28]:
prediction_knn = knn_grid.predict(x_test_features)

In [30]:
np.savetxt(fname="predictions.csv", X=prediction_knn, delimiter=',', fmt="%s")