In [10]:
import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
import string

In [11]:
# Read only the columns used further down: tweet text (+ sentiment label for training).
train_data = pd.read_csv(
    'training_twitter_x_y_train.csv',
    usecols=['text', 'airline_sentiment'],
)
test_data = pd.read_csv(
    'test_twitter_x_test.csv',
    usecols=['text'],
)

In [12]:
# Start from NLTK's English stop-word list and report its size.
stop_words = stopwords.words('english')
print(len(stop_words))
# Each punctuation character from string.punctuation is treated as a stop word too.
stop_words.extend(string.punctuation)

179


In [13]:
# Size of the stop-word list after the punctuation characters were appended.
len(stop_words)

211

In [14]:
def get_simple_pos(tag):
    """Translate a POS tag string into one of the WordNet POS constants.

    The tag is matched by its leading letter: 'J' -> adjective, 'V' -> verb,
    'R' -> adverb. Every other tag (including those starting with 'N')
    maps to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Nouns and anything unrecognised share the same result.
    return wordnet.NOUN

In [15]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Both normalisers are instantiated here; clean_words in this notebook calls
# only `stem`, it does not call `lemmatizer`.
lemmatizer = WordNetLemmatizer()
stem = PorterStemmer()
# Quick sanity check of the stemmer on a sample word.
stem.stem('playing')

'play'

In [16]:
def split_words(word):
    """Return the text between the first '@' in ``word`` and the next '@' (or the end).

    For a mention such as ``'@united'`` this yields ``'united'``.

    Args:
        word: a single whitespace-delimited token.

    Returns:
        The segment following the first '@'. If ``word`` contains no '@'
        it is returned unchanged instead of raising ``IndexError``.
    """
    _, marker, remainder = word.partition('@')
    if not marker:
        # No '@' present: there is no handle to extract, so keep the token as is.
        return word
    # Stop at the next '@' so the result matches ``word.split('@')[1]``.
    return remainder.split('@', 1)[0]

In [39]:
def clean_words(words):
    """Split a tweet on whitespace and return its stemmed, lower-cased tokens.

    Tokens whose lower-cased form is in the module-level ``stop_words`` are
    dropped. Tokens starting with '@' are first reduced with ``split_words``.
    Every remaining token is stemmed with the module-level ``stem`` and
    lower-cased.

    Args:
        words: the raw tweet text (a single string).

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    output_words = []
    for word in words.split():
        if word.lower() in stop_words:
            continue
        # Mentions are reduced to the part after '@' before stemming.
        token = split_words(word) if word.startswith('@') else word
        # No POS tagging here: the stemmer does not take a POS argument, so
        # tagging each token would be wasted work.
        output_words.append(stem.stem(token).lower())
    return output_words

In [18]:
# Pull the columns out of the DataFrames as plain Python lists.
train_text = list(train_data['text'])
train_sentiments = list(train_data['airline_sentiment'])
test_text = list(test_data['text'])

In [40]:
# Training entries are (token list, sentiment) pairs; test entries are bare token lists.
cleaned_train_words = [
    (clean_words(tweet), label)
    for tweet, label in zip(train_text, train_sentiments)
]
cleaned_test_words = [clean_words(tweet) for tweet in test_text]

In [20]:
# Number of cleaned training entries and cleaned test entries, in that order.
len(cleaned_train_words), len(cleaned_test_words)

(10980, 3660)

In [21]:
# Flatten the per-tweet token lists of the training set into one long list.
all_words = []
for tokens, _sentiment in cleaned_train_words:
    all_words.extend(tokens)

In [22]:
# Frequency table over every cleaned training token.
freq = nltk.FreqDist(all_words)
# Displays the 2100 most frequent tokens with their counts.
freq.most_common(2100)

[('unit', 3028),
 ('flight', 2924),
 ('usairway', 2189),
 ('americanair', 2178),
 ('southwestair', 1786),
 ('jetblu', 1559),
 ('get', 1206),
 ('thank', 1003),
 ('cancel', 792),
 ('hour', 682),
 ('custom', 632),
 ('delay', 557),
 ('servic', 539),
 ('call', 509),
 ('wait', 505),
 ('time', 502),
 ('need', 502),
 ("i'm", 498),
 ('help', 497),
 ('fli', 486),
 ('&amp;', 481),
 ('2', 457),
 ('hold', 438),
 ('still', 429),
 ('go', 427),
 ('us', 422),
 ('tri', 420),
 ('would', 406),
 ('plane', 404),
 ('bag', 401),
 ('one', 399),
 ("can't", 381),
 ('book', 368),
 ('virginamerica', 366),
 ('make', 357),
 ('got', 350),
 ('like', 337),
 ('pleas', 335),
 ('gate', 334),
 ('back', 325),
 ('late', 315),
 ('check', 315),
 ('miss', 311),
 ('airlin', 298),
 ('chang', 291),
 ('take', 287),
 ('flightl', 277),
 ('seat', 275),
 ('day', 272),
 ('agent', 267),
 ('know', 258),
 ('flight.', 257),
 ('u', 252),
 ('guy', 248),
 ('travel', 241),
 ('phone', 240),
 ('never', 235),
 ('say', 234),
 ('ticket', 233),
 ('wa

In [23]:
def get_features_list(all_words, s):
    """Return the ``s`` most frequent tokens of ``all_words``, most frequent first.

    Args:
        all_words: iterable of tokens to count.
        s: how many of the top tokens to keep.

    Returns:
        list: the tokens only (counts are discarded).
    """
    ranked = nltk.FreqDist(all_words).most_common(s)
    return [token for token, _count in ranked]
# Vocabulary used for the boolean features: the 2100 most frequent training tokens.
features = get_features_list(all_words, 2100)

In [24]:
# Sanity check on the size of the feature vocabulary.
len(features)

2100

In [25]:
def get_features_dict(words):
    """Build a boolean presence map of the vocabulary for one token list.

    Reads the module-level ``features`` list and, for each entry, records
    whether it occurs in ``words``.

    Args:
        words: iterable of tokens for a single document.

    Returns:
        dict: ``{feature: bool}`` with one key per entry of ``features``,
        in the order of ``features``.
    """
    # A set makes each of the len(features) membership checks constant-time.
    word_set = set(words)
    return {feature: feature in word_set for feature in features}

In [26]:
# Pair each training tweet's feature dict with its sentiment label.
train_features = [
    (get_features_dict(tokens), label)
    for tokens, label in cleaned_train_words
]

In [27]:
# Feature dicts for the unlabelled test tweets.
test_features = [get_features_dict(tokens) for tokens in cleaned_test_words]

In [29]:
from nltk import NaiveBayesClassifier
# train_features holds the (feature-dict, sentiment) pairs built from the
# cleaned training tweets, which is the labelled-featureset input that
# NaiveBayesClassifier.train takes.
clf = NaiveBayesClassifier.train(train_features)

In [134]:
# Sentiment labels, aligned with the cleaned training tweets.
Y = [label for _tokens, label in cleaned_train_words]

In [135]:
# Re-join each training tweet's cleaned tokens into one space-separated string.
X = [" ".join(tokens) for tokens, _label in cleaned_train_words]

In [131]:
# Same re-joining for the unlabelled test tweets.
X_test_or = [" ".join(tokens) for tokens in cleaned_test_words]

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [104]:
# Hold out 30% of the labelled, cleaned tweets with a fixed seed.
# NOTE(review): no executable code in the later cells shown here reads
# X_train / X_test / Y_train / Y_test; the vectorizer and model are fitted on
# the full X / Y. Confirm whether this split is still needed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=76969)

In [136]:
# TF-IDF over unigrams and bigrams of the space-joined cleaned tokens.
vector = TfidfVectorizer(ngram_range=(1, 2))
# Despite its name, X_train_vector is fitted on ALL of X, not on X_train.
X_train_vector = vector.fit_transform(X)
# The unlabelled test tweets are transformed with the vectorizer fitted above.
X_test_vector = vector.transform(X_test_or)

In [146]:
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.linear_model import LogisticRegression
# Logistic regression fitted on the TF-IDF matrix of the full labelled set.
# This rebinds `clf`, which the NaiveBayes cell also assigns.
clf = LogisticRegression(n_jobs=2, C=0.9285714285714285, solver='saga')
clf.fit(X_train_vector, Y)
# Accuracy on the same data the model was fitted on, not a held-out score.
print(clf.score(X_train_vector, Y))
# Disabled: X_test_vector is built from the unlabelled test tweets, so it does not line up with Y.
# print(clf.score(X_test_vector, Y))
# Predicted labels for the unlabelled test tweets.
Y_pred = clf.predict(X_test_vector)
# Disabled: Y_pred covers the unlabelled test tweets, while Y_test comes from the train/test split.
# print(confusion_matrix(Y_test, Y_pred))

0.8890710382513661


In [141]:
# Predict labels for the unlabelled test tweets with whatever `clf` currently is.
# NOTE(review): this cell's execution count (141) is lower than that of the cell
# that fits the logistic regression (146), and `np` is imported in that cell;
# confirm the intended run order.
Y_pred = clf.predict(X_test_vector)
# Write the predictions to predicted.csv, formatting each value with '%s'.
np.savetxt('predicted.csv', Y_pred, fmt='%s')