In [173]:
# Import the required libraries
import pandas as pd
import numpy as np
import re

# for stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import  GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [174]:
# Load the combined sarcastic / non-sarcastic dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="sarcastic+NonSarcastic_DataSets.csv")

In [175]:
# Remove the extra "Unnamed: 0" column (presumably an index saved with the CSV).
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [176]:
# Preview the raw text column before any cleaning is applied.
df["Text"]

0        Contraceptives (for example a condom/ the pill...
1        You not differentiating between the beaver and...
2        Why don't you just sum it up with "If you're n...
3        Oh, we don't have to - sorry, I wasn't clear e...
4          Sorry, sometimes sarcasm is quite hard to read.
                               ...                        
39246    @Zendaya I could see the makeup artists giving...
39247    @ZiggiWatkins11 Slvr... That's great name #NOT...
39248    @zoso4986 @Nero He is the fag we need but not ...
39249    Zuma sounding like Kanye West right now trying...
39250    @ZZUCRU @UWDawgPack So true. Students - stick ...
Name: Text, Length: 39251, dtype: object

In [177]:
def displayInformation(frame=None):
    """Print a short summary of a labelled text DataFrame.

    Three sections are printed, each followed by a line of 50 asterisks:
    the value counts of the ``class`` column, the column index, and a
    per-column flag telling whether the column contains any null value.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to summarise; it must have a ``class`` column. When omitted,
        the notebook-level ``df`` is used, so existing ``displayInformation()``
        calls keep working unchanged.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    # Fall back to the notebook-global frame only when no frame is supplied.
    data = df if frame is None else frame
    separator = "*" * 50

    print("Value Counts:\n", data['class'].value_counts())
    print(separator)

    print("Column Name:\n", data.columns)
    print(separator)

    # Null Value: True for every column holding at least one missing entry
    print(data.isnull().any(axis=0))
    print(separator)
    
    

In [178]:
# Print class balance, column names and per-column null flags for `df`.
displayInformation()

Value Counts:
 0    19772
1    19479
Name: class, dtype: int64
**************************************************
Column Name:
 Index(['Text', 'class'], dtype='object')
**************************************************
Text     False
class    False
dtype: bool
**************************************************


In [179]:
# Target vector: the `class` column, which holds the values 0 and 1
# (which value marks sarcasm is not shown in this notebook).
labels = df["class"] 

In [180]:
def pre_processing(texts=None):
    """Clean and stem raw text, returning one normalised string per row.

    Each entry has every character outside ``a-z`` / ``A-Z`` replaced by a
    space, is split on whitespace, has every token passed through NLTK's
    ``PorterStemmer``, and is re-joined with single spaces.

    Unlike the previous version, this function no longer overwrites
    ``df["Text"]``: the raw column stays intact, so earlier cells that
    displayed it are not silently made stale and the function has no
    hidden side effect on notebook state.

    Parameters
    ----------
    texts : pandas.Series of str, optional
        Text to process. Defaults to the notebook-level ``df["Text"]`` so
        that existing ``pre_processing()`` calls behave as before.

    Returns
    -------
    pandas.Series
        Stemmed text with the same index and name as the input.
    """
    if texts is None:
        texts = df["Text"]

    stemmer = PorterStemmer()

    def _clean_and_stem(text):
        # Strip special characters and symbols, keeping ASCII letters only.
        letters_only = re.sub('[^a-zA-Z]', ' ', text)
        # split() with no argument also collapses the runs of spaces
        # introduced by the substitution above.
        return ' '.join(stemmer.stem(token) for token in letters_only.split())

    # A single pass over the column replaces the three separate apply() calls.
    return texts.apply(_clean_and_stem)

In [181]:
# Run the cleaning + stemming step on df["Text"]; the result is a Series of stemmed strings.
features = pre_processing()

In [182]:
# Preview the stemmed text.
features

0        contracept for exampl a condom the pill aren t...
1        you not differenti between the beaver and the ...
2        whi don t you just sum it up with if you re no...
3        oh we don t have to sorri i wasn t clear enoug...
4               sorri sometim sarcasm is quit hard to read
                               ...                        
39246    zendaya i could see the makeup artist give u a...
39247       ziggiwatkin slvr that s great name not sarcasm
39248    zoso nero he is the fag we need but not the fa...
39249    zuma sound like kany west right now tri to exp...
39250    zzucru uwdawgpack so true student stick around...
Name: Text, Length: 39251, dtype: object

In [183]:
# Turn the stemmed text into a TF-IDF matrix capped at 5000 vocabulary terms.
#
# NOTE(review): the vectorizer is fitted on every row here, i.e. before the
# train/test split performed in a later cell, so its vocabulary and IDF
# weights are learned from the rows that end up in the test set as well.
# NOTE(review): `features` is rebound from text to a dense array, so this
# cell expects the stemmed text as input and should not be re-run on its own
# output; .toarray() also materialises the full dense matrix in memory.
tfidf = TfidfVectorizer(max_features=5000)
features = tfidf.fit_transform(list(features)).toarray()

features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [184]:
# Split into training (80%) and testing (20%) data.
# random_state pins the shuffle so that the split, and every score computed
# from it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)



In [185]:
# The data is now ready for the model pipeline: four classifiers are fitted
# on the same split and their train / test accuracy is reported.

lsvm = LinearSVC()                                              # model 1: linear support vector classifier
gnb = GaussianNB()                                              # model 2: Gaussian naive Bayes
lr = LogisticRegression()                                       # model 3: logistic regression
rfc = RandomForestClassifier(n_estimators=10, random_state=0)   # model 4: random forest

# Banner labels are kept verbatim so the printed report is unchanged.
for title, clf in (
    ("Linear Support Vector Machine", lsvm),
    ("Gussain Naive Biase", gnb),
    ("Logistic Regression", lr),
    ("RandomForestClassifier", rfc),
):
    clf.fit(X_train, y_train)
    # score() returns the mean accuracy on the given data and labels
    print("*"*10, title, "*"*10)
    print("Score on Train Data :", clf.score(X_train, y_train))
    print("Score on Test Data :", clf.score(X_test, y_test))


********** Linear Support Vector Machine **********
Score on Train Data : 0.9755732484076434
Score on Test Data : 0.9737613042924468
********** Gussain Naive Biase **********
Score on Train Data : 0.8273248407643312
Score on Test Data : 0.8072856960896702
********** Logistic Regression **********
Score on Train Data : 0.9743312101910828
Score on Test Data : 0.9737613042924468
********** RandomForestClassifier **********
Score on Train Data : 0.988407643312102
Score on Test Data : 0.9622977964590498
