In [1]:
#importing the libraries 
from collections import Counter
from nltk.corpus import stopwords
from numpy import array
import pandas as pd
import numpy as np
import preprocessor as p
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer

In [2]:
#Reading the data
data = pd.read_csv('train.txt', sep='\t')

# Pre-Processing steps

In [3]:
#function to clean the tweets 
def clean_docs(tweet_list):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) 
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
    cleaned_text = []
    for i in tweet_list:
        cleaned_text.append(p.clean(i))
        
    stop_word_cleared = []  
    for j in cleaned_text:
        word_tokens = j.split()
        filtered_sentence = ''

        for w in word_tokens: 
            if w not in stop_words: 
                w = lemmatizer.lemmatize(w)
                filtered_sentence = filtered_sentence + ' ' +w.lower()
        stop_word_cleared.append(filtered_sentence)
    return stop_word_cleared

In [4]:
#Seperating into irony and non irony 
irony = []
non_irony = []

for i in range(len(data)):
    if data['Label'][i] == 1:
        irony.append(data['Tweet text'][i])
    else:
        non_irony.append(data['Tweet text'][i])

In [5]:
#seperating into test and training 
irony_train = irony[:-382]
irony_test = irony[-382:]
non_irony_train = non_irony[:-382]
non_irony_test = non_irony[-382:]

In [6]:
#creation of test and training 
Xtrain = irony_train + non_irony_train
Xtest = irony_test + non_irony_test

In [7]:
#creating test and training labels 
ytrain = array([1 for _ in range(1519)] + [0 for _ in range(1534)])
ytest = array([1 for _ in range(382)] + [0 for _ in range(382)])

In [8]:
Xtrain_cleaned = clean_docs(Xtrain)
Xtest_cleaned = clean_docs(Xtest)

In [14]:
#creating a tfidf vector
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
train_tfidf = tfidf_vectorizer.fit_transform(Xtrain_cleaned)

In [13]:
test_tfifd = tfidf_vectorizer.transform(Xtest_cleaned)

# Building the model

In [15]:
#Logistic regression model using BOW as feature 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

lreg = LogisticRegression()
lreg.fit(train_tfidf, ytrain) # training the model

prediction = lreg.predict_proba(test_tfifd) # predicting on the validation set
prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction_int.astype(np.int)

f1_score(ytest, prediction_int) # calculating f1 score

0.7257203842049094

# Getting the prediction

In [16]:
#Reading the test data
test_data = pd.read_csv('test.csv')

In [17]:
#Cleaning the test data
testset_cleaned = clean_docs(test_data['Tweet text'])

In [19]:
tfidf_test = tfidf_vectorizer.transform(testset_cleaned)
test_pred = lreg.predict_proba(tfidf_test)
test_pred_int = test_pred[:,1] >= 0.3
test_pred_int = test_pred_int.astype(np.int)
test_data['Label'] = test_pred_int