In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# Load the Hi-En headline records, coercing the id and label columns to int
# and the headline text to str.
data = pd.read_json(
    "Hi-En.json",
    orient="records",
    dtype={"article_link": int, "is_sarcastic": int, "headline": str},
)
print(data.head())

   article_link  is_sarcastic  \
0             0             0   
1             1             0   
2             2             1   
3             3             1   
4             4             1   

                                            headline  
0  ather farouqui general secretary of ghar empha...  
1  by passing of is started ji jaggo nahi to sama...  
2  swadu duniya geeta parjapat manjeetgill royal ...  
3  hurry up kahin ye offer miss na ho jaye p p p p p  
4  s logic hasne ke paise milte hai to alag alag ...  


In [None]:
# Turn the numeric 0/1 labels into readable class names.
# `replace` leaves any value that is not 0 or 1 untouched, so re-running this
# cell keeps the labels intact; `map` would turn the already-converted strings
# into NaN on a second run.
data["is_sarcastic"] = data["is_sarcastic"].replace({0: "Not Sarcasm", 1: "Sarcasm"})
print(data.head())

   article_link is_sarcastic  \
0             0  Not Sarcasm   
1             1  Not Sarcasm   
2             2      Sarcasm   
3             3      Sarcasm   
4             4      Sarcasm   

                                            headline  
0  ather farouqui general secretary of ghar empha...  
1  by passing of is started ji jaggo nahi to sama...  
2  swadu duniya geeta parjapat manjeetgill royal ...  
3  hurry up kahin ye offer miss na ho jaye p p p p p  
4  s logic hasne ke paise milte hai to alag alag ...  


In [None]:
# Keep just the headline text and its label, as plain arrays.
data = data[["headline", "is_sarcastic"]]
x = np.array(data["headline"])
y = np.array(data["is_sarcastic"])

# Split the raw text before vectorizing so the vocabulary is learned from the
# training portion only; fitting the vectorizer on every headline would let
# information from the test set leak into the features.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

cv = CountVectorizer()
X_train = cv.fit_transform(x_train)  # learn the vocabulary and encode the training text
X_test = cv.transform(x_test)        # encode the test text with the training vocabulary
X = cv.transform(x)                  # whole corpus in the same feature space, kept for any cell that reads `X`

Bernoulli Naive Bayes Algorithm

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Train a Bernoulli Naive Bayes classifier on the training split.
model = BernoulliNB()
model.fit(X=X_train, y=y_train)

In [None]:
# Classify one sentence typed by the user with the Naive Bayes model.
user = input("Enter a Text: ")
# Use a dedicated name instead of rebinding `data`, so the DataFrame stays
# available and the earlier cells can still be re-run. The sparse row is
# passed straight to predict(); densifying it is not needed.
user_features = cv.transform([user])
output = model.predict(user_features)
print(output)

Enter a Text: Aapne apne aap ko einstein smjha hai kya
['Sarcasm']


In [None]:
# Score the Naive Bayes model on the held-out split.
preds = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, preds), sep='\n')

Accuracy: 
0.786695652173913


In [None]:
# Per-class precision, recall and F1 for the Naive Bayes test predictions.
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

 Not Sarcasm       0.80      0.77      0.78     11414
     Sarcasm       0.78      0.81      0.79     11586

    accuracy                           0.79     23000
   macro avg       0.79      0.79      0.79     23000
weighted avg       0.79      0.79      0.79     23000



Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (liblinear solver, fixed seed) on the
# training split.
logi_model = LogisticRegression(random_state=0, solver='liblinear')
logi_model.fit(X=X_train, y=y_train)

In [None]:
# Classify one sentence typed by the user with the logistic-regression model.
user = input("Enter a Text: ")
# Use a dedicated name instead of rebinding `data`, so the DataFrame stays
# available and the earlier cells can still be re-run. The sparse row is
# passed straight to predict(); densifying it is not needed.
user_features = cv.transform([user])
output = logi_model.predict(user_features)
print(output)

Enter a Text: Very well done, shayad apne aapko Einstein samajh rakha hai.
['Sarcasm']


In [None]:
# Score the logistic-regression model on the held-out split.
preds = logi_model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, preds), sep='\n')

Accuracy: 
0.8601304347826086


In [None]:
# Per-class precision, recall and F1 for the logistic-regression test predictions.
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

 Not Sarcasm       0.86      0.85      0.86     11414
     Sarcasm       0.86      0.87      0.86     11586

    accuracy                           0.86     23000
   macro avg       0.86      0.86      0.86     23000
weighted avg       0.86      0.86      0.86     23000



Support Vector Machine

In [None]:
# Import required library

import numpy as np
from math import exp
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.metrics import classification_report

In [None]:
# Load the CSV dataset and hold out 20% of the rows for testing

dataset = pd.read_csv('Hi-En train.csv')
x_1, x_2, y_1, y_2 = train_test_split(
    dataset['headline'],
    dataset['is_sarcastic'],
    test_size=0.20,
    random_state=100,
)

In [None]:
# Build TF-IDF features: the vectorizer is fitted on the training text only
# and then applied unchanged to the held-out text

vectorizer = TfidfVectorizer(
    use_idf=True,
    sublinear_tf=True,
    min_df=5,
    max_df=0.8,
)
X_1 = vectorizer.fit_transform(x_1)
X_2 = vectorizer.transform(x_2)

In [None]:
# Change split data into dataframes

def _as_single_column_frame(values, column):
    """Return `values` as a one-column DataFrame named `column`.

    The result gets a fresh 0..n-1 index. `values` may be a Series or the
    one-column DataFrame produced by an earlier run of this cell, so the cell
    can be executed more than once.
    """
    # `series[:, np.newaxis]` is deprecated multi-dimensional indexing on a
    # Series (it raises on pandas 2.0+); reshaping the underlying array
    # produces the same (n, 1) layout.
    column_vector = np.asarray(values).reshape(-1, 1)
    return pd.DataFrame(column_vector, columns=[column])


x_1 = _as_single_column_frame(x_1, 'headline')
x_2 = _as_single_column_frame(x_2, 'headline')
y_1 = _as_single_column_frame(y_1, 'is_sarcastic')
y_2 = _as_single_column_frame(y_2, 'is_sarcastic')

  x_1 = x_1[:, np.newaxis]
  x_2 = x_2[:, np.newaxis]
  y_1 = y_1[:, np.newaxis]
  y_2 = y_2[:, np.newaxis]


In [None]:
# Fit SVM and predict on test data

classifier = svm.SVC(kernel='linear')
# Pass the label column as a 1-D Series: handing the whole one-column
# DataFrame to fit() makes scikit-learn warn that a column vector was given
# where a 1-D array was expected.
classifier.fit(X_1, y_1['is_sarcastic'])
prediction = classifier.predict(X_2)

  y = column_or_1d(y, warn=True)


In [None]:
# Evaluate the SVM predictions on the held-out split

# Both metrics receive the same 1-D label column.
print('Accuracy: ')
print(accuracy_score(y_2['is_sarcastic'], prediction))

# With output_dict=True the report is keyed by the label rendered as a
# string, hence '1' and '0'.
scores = classification_report(y_2['is_sarcastic'], prediction, output_dict=True)
print('positive: ', scores['1'])
print('negative: ', scores['0'])

Accuracy: 
0.8736521739130435
positive:  {'precision': 0.8932345768223287, 'recall': 0.8515456815637648, 'f1-score': 0.8718920825251278, 'support': 11613}
negative:  {'precision': 0.8554782462905525, 'recall': 0.8961974181083692, 'f1-score': 0.8753645565277063, 'support': 11387}


In [None]:
# Classify one sentence typed by the user with the SVM.
user = input("Enter a Text: ")
# Use a dedicated name instead of rebinding `data`, so the DataFrame loaded at
# the top of the notebook is not replaced by a feature matrix. The sparse
# TF-IDF row is passed straight to predict(); densifying it is not needed.
user_features = vectorizer.transform([user])
output = classifier.predict(user_features)
print(output)

Enter a Text: Aapne apne aap ko einstein smjha hai kya
[1]
