In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory holding the two CSV files. It defaults to the original download
# location and can be redirected with the SARCASM_DATA_DIR environment
# variable, so the notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get("SARCASM_DATA_DIR", r"C:\Users\33752\Downloads")

# Labelled headlines (used for fitting) and the headlines to predict on.
train_data = pd.read_csv(os.path.join(DATA_DIR, "Train_Dataset.csv"))
test_data = pd.read_csv(os.path.join(DATA_DIR, "Test_Dataset.csv"))

In [3]:
# Preview the first rows of the raw training data.
train_data.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [4]:
# Count how many training rows carry each label value in `is_sarcastic`.
train_data["is_sarcastic"].value_counts()

0    23958
1    20304
Name: is_sarcastic, dtype: int64

The training set is almost balanced: about 54% of the headlines are labelled 0 and about 46% are labelled 1.

## Data preprocessing

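Expanding contractions, e.g. "don't" -> "do not". Note that possessives such as "emily's" are not expanded (see the preview after cleaning, where it becomes "emily s").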

In [5]:
import contractions
# Run contractions.fix over every headline of both data sets and store the
# result back in the same 'headline' column.
for frame in (train_data, test_data):
    frame["headline"] = frame["headline"].apply(contractions.fix)

Removing Special Characters and Symbols

In [6]:
import re

def remove_special_characters(text, remove_digits=False):
    """Blank out every character that is not an ASCII letter, digit or whitespace.

    Each disallowed character is replaced by a single space (not deleted), so
    the returned string has the same length as the input and may contain runs
    of consecutive spaces.

    Parameters
    ----------
    text : str
        The string to clean.
    remove_digits : bool, default False
        When True, digits are treated as disallowed characters as well.

    Returns
    -------
    str
        The cleaned string.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, ' ', text)

In [7]:
# Strip punctuation and symbols from the headlines of both data sets
# (digits are kept because remove_digits is left at its default).
for frame in (train_data, test_data):
    frame["headline"] = frame["headline"].apply(remove_special_characters)

In [8]:
# Preview the training data again to check the effect of the text cleaning.
train_data.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7 2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily s list founder women are the problem s...,0
3,send your kids back to school with confidence,0
4,watch experts talk pesticides and health,0


## Building the model

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 10% of the labelled rows for validation. With shuffle=False the
# rows keep their original order, so the split is a plain cut of the frame
# and each part keeps its original row labels.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_data[["headline"]],
    train_data["is_sarcastic"],
    shuffle=False,
    test_size=0.1,
)

----------------------------------------------------------------------------------------------------------------------------------------------------------
#### Simple model using ML

In [11]:
import textblob

In [12]:
import string

def add_text_features(frame):
    """Return a copy of ``frame`` extended with length and sentiment features.

    Working on an explicit copy avoids writing into a frame that may be a
    slice of another DataFrame (the source of pandas' SettingWithCopyWarning)
    and keeps the recipe in one place instead of repeating it per split.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'headline' column holding strings.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The enriched copy, with the new columns 'char_count', 'word_count',
        'word_density', 'Polarity' and 'Subjectivity', and the per-row
        TextBlob sentiment objects those last two columns were read from.
    """
    enriched = frame.copy()
    enriched['char_count'] = enriched['headline'].apply(len)
    enriched['word_count'] = enriched['headline'].apply(lambda x: len(x.split()))
    # The +1 keeps the ratio defined for a headline that contains no words.
    enriched['word_density'] = enriched['char_count'] / (enriched['word_count'] + 1)
    sentiments = enriched['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
    enriched['Polarity'] = [obj.polarity for obj in sentiments.values]
    enriched['Subjectivity'] = [obj.subjectivity for obj in sentiments.values]
    return enriched, sentiments


# Same feature recipe for both splits; the sentiment Series keep their
# original names so anything that reads them afterwards still works.
X_train, x_train_snt_obj = add_text_features(X_train)
X_val, X_val_snt_obj = add_text_features(X_val)

In [13]:
# Preview the training features after adding the length and sentiment columns.
X_train.head()

Unnamed: 0,headline,char_count,word_count,word_density,Polarity,Subjectivity
0,supreme court votes 7 2 to legalize all worldl...,53,10,4.818182,0.0,0.0
1,hungover man horrified to learn he made dozens...,66,12,5.076923,0.0,0.066667
2,emily s list founder women are the problem s...,65,11,5.416667,0.0,0.0
3,send your kids back to school with confidence,45,8,5.0,0.0,0.0
4,watch experts talk pesticides and health,41,6,5.857143,0.0,0.0


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and weights on the training headlines only; the
# validation headlines are transformed with the already-fitted vectorizer.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(X_train["headline"])
tv_matrix = tv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tv, "get_feature_names_out"):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()

# Reuse the row labels of the source frames so that a later column-wise
# concat lines the rows up, instead of relying on a hard-coded offset.
X_traintfidf = pd.DataFrame(tv_matrix, columns=vocab, index=X_train.index)

X_valtfidf = pd.DataFrame(
    tv.transform(X_val["headline"]).toarray(),
    columns=vocab,
    index=X_val.index,
)

X_traintfidf.head()

Unnamed: 0,00,000,00000000001,00003,000th,025,03,047,071,10,...,zoo,zookeeper,zoolander,zoologists,zoomed,zoroastrianism,zsa,zucker,zuckerberg,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Place the hand-crafted numeric features next to the TF-IDF columns for each
# split; the raw text column is dropped first. Rows are matched by index label.
X_train_comb = pd.concat(
    [X_train.drop(columns="headline"), X_traintfidf],
    axis="columns",
)
X_val_comb = pd.concat(
    [X_val.drop(columns="headline"), X_valtfidf],
    axis="columns",
)

X_train_comb.head()

Unnamed: 0,char_count,word_count,word_density,Polarity,Subjectivity,00,000,00000000001,00003,000th,...,zoo,zookeeper,zoolander,zoologists,zoomed,zoroastrianism,zsa,zucker,zuckerberg,zz
0,53,10,4.818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,66,12,5.076923,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,65,11,5.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45,8,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,41,6,5.857143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Logistic-regression classifier trained on the combined feature matrix.
lr = LogisticRegression(
    solver='liblinear',
    random_state=42,
    C=1,
)
lr.fit(X_train_comb, Y_train)

LogisticRegression(C=1, random_state=42, solver='liblinear')

In [18]:
# Score the held-out validation split.
predictions = lr.predict(X_val_comb)

# Text report of precision / recall / F1, then the confusion matrix as the
# cell's displayed value.
val_report = classification_report(Y_val, predictions)
print(val_report)
pd.DataFrame(confusion_matrix(Y_val, predictions))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      2339
           1       0.88      0.89      0.88      2088

    accuracy                           0.89      4427
   macro avg       0.89      0.89      0.89      4427
weighted avg       0.89      0.89      0.89      4427



Unnamed: 0,0,1
0,2089,250
1,236,1852


In [22]:
# Work on a copy: a plain `X_test = test_data` would only alias the frame, so
# the feature columns below would be written into test_data itself.
X_test = test_data.copy()

# Same length and sentiment features as were built for the training split.
X_test['char_count'] = X_test['headline'].apply(len)
X_test['word_count'] = X_test['headline'].apply(lambda x: len(x.split()))
X_test['word_density'] = X_test['char_count'] / (X_test['word_count']+1)
X_test_snt_obj = X_test['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
X_test['Polarity'] = [obj.polarity for obj in X_test_snt_obj.values]
X_test['Subjectivity'] = [obj.subjectivity for obj in X_test_snt_obj.values]

# Transform (never re-fit) with the vectorizer fitted on the training data.
# The index is copied from X_test so the column-wise concat below matches rows
# by label even if test_data does not carry a default 0..n-1 index.
X_testtfidf = pd.DataFrame(
    tv.transform(X_test["headline"]).toarray(),
    columns=vocab,
    index=X_test.index,
)

X_test_comb = pd.concat([X_test.drop("headline", axis=1), X_testtfidf], axis=1)


result = lr.predict(X_test_comb)

In [25]:
#save results localy
# Save the predictions locally as a single-column CSV, without the row index.
res = pd.DataFrame(result, columns=['prediction'])
res.to_csv("prediction_results.csv", index=False)