In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download NLTK resources (do it once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jayen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
file=r"C:\Users\jayen\Text-sentiment-analysis-general-purpose\artifacts\sarcasm_dataset_updated.csv"
n=100000
df=pd.read_csv(file)

In [8]:
df

Unnamed: 0,text,label
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1
...,...,...
1066094,I'm sure that Iran and N. Korea have the techn...,1
1066095,"whatever you do, don't vote green!",1
1066096,Perhaps this is an atheist conspiracy to make ...,1
1066097,The Slavs got their own country - it is called...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    100000 non-null  object
 1   label   100000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [9]:
df['label'].value_counts()

label
0    535373
1    530726
Name: count, dtype: int64

In [5]:
pd.api.types.is_numeric_dtype(df['is_sarcastic'])

True

In [4]:

import re

#NOW ILL DO SOME PREPROCESSING STEPS FOR THAT ILL CREATE A FUNCTION
def preprocessing(q):

  q=str(q).lower().strip()#strip gives a string in return if u use split it will give a list of words
  #The strip() method in Python is used to remove leading and trailing characters (whitespace characters by default) from a string before " Hello, world! " after"Hello, world!"
   # Replace certain special characters with their string equivalents
  q = q.replace('%', 'percent')
  q = q.replace('$', 'dollar ')
  q = q.replace('₹', 'rupee ')
  q = q.replace('€', 'euro ')
  q = q.replace('@', 'at')
  q=q.replace('&','and')

  q = re.sub(r'([0-9]+)000000000', r'\1b', re.sub(r'([0-9]+)000000', r'\1m', re.sub(r'([0-9]+)000', r'\1k', q)))#Input: "1500000000 Output: "1.5b" it will do this
  q=re.sub(r'\b(?:\d{1,4}[/-]\d{1,2}[/-]\d{1,4}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b','',q)#for removing dates
  q=re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b','',q)#removing email
  q= re.sub(r'https?://\S+|www\.\S+','',q)#removing urls
  q=re.sub("[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF"
                             "\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF"
                             "\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
                             "\U00002702-\U000027B0\U000024C2-\U0001F251]+", '', q)



    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
  contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }
  q_decontracted=[]

  q_decontracted = [contractions[i] if i in contractions else i for i in q.split()]


  #now i join this word into q
  q=' '.join(q_decontracted)
  q = q.replace("'ve", " have")
  q = q.replace("n't", " not")
  q = q.replace("'re", " are")
  q = q.replace("'ll", " will")
  # Removing HTML tags
  q = BeautifulSoup(q)
  q = q.get_text()
  # Remove punctuations like ,.""[]
  q =re.sub(r'[^\w\s{}\/<>\'"\-_=+()*~:;]', ' ', q)

  q = word_tokenize(q)  # Convert to lowercase

  # Remove stopwords and non-alphabetic characters, perform lemmatization
  q = [
        lemmatizer.lemmatize(word) for word in q
        if word not in stop_words
    ]



  return ' '.join(q)#ALWAYS REMEMBER IN ORDER TO USE WORD2VEC UR SENTENSES SHOULD BE IN TOKKENS OR WORDS BUT IN THIS CASE WE HAVE TO TRAIN OUR MODEL WE HAVE TO TOOKENIZE
  #THE WORDS SO WE USE JOIN THEN AFTER WE TOKKENIZE IT WONT BE ABLE TO CONCATINATE LIST OF STRINGS IMPORTANT. contain lists instead of strings.


In [7]:
df['preprocessed']=df['headline'].apply(preprocessing)

  q = BeautifulSoup(q)


here using neattext

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import neattext as nt
!pip install contractions
import contractions

def clean_text(text):
    # Initialize NeatText TextFrame and NLTK WordNetLemmatizer
    # Handle contractions
    text = contractions.fix(text)
    docx = nt.TextFrame(text)
    

    # Clean text using NeatText
    cleaned_text = docx.remove_puncts()
    cleaned_text = cleaned_text.remove_special_characters()
    cleaned_text = cleaned_text.remove_userhandles()
    cleaned_text = cleaned_text.remove_urls()
    cleaned_text = cleaned_text.remove_dates()
    cleaned_text = cleaned_text.remove_emails()
    cleaned_text = cleaned_text.remove_emojis()

    # Convert text to lowercase
    cleaned_text = str(cleaned_text).lower()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    cleaned_text = " ".join(word for word in cleaned_text.split() if word not in stop_words)

    # Lemmatize words
    cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in cleaned_text.split())

    return cleaned_text



In [8]:
df['preprocessed2']=df['text'].apply(clean_text)

In [5]:
df=df.dropna()

In [7]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [6]:
df.sample(10)

Unnamed: 0,text,label
53096,cambodian paper takes parting shot at 'dictato...,0
51615,yoga for the heart,0
67144,Johnny Cash is still alive?,0
50112,loyal driveway patiently waiting for owner to ...,1
23359,toddler shits her way through 3rd halloween co...,1
56953,Where's the Chicago version?,0
36805,this teacher has watched 3 deadly attacks from...,0
86759,"I'm in the party of Zamasu tanking it, being m...",0
85001,Surely there's no Western country where that w...,1
77579,Cuz they obviously make awesome decisions,1


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer instance
vectorizer = CountVectorizer()

x=np.array(df['preprocessed2'])
y=np.array(df['is_sarcastic'])

X=vectorizer.fit_transform(x)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [25]:
from sklearn.linear_model import LogisticRegression
l=LogisticRegression()
l.fit(X_train,y_train)
#for logistic regression
y_pred = l.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.703925

In [26]:
from sklearn.naive_bayes import MultinomialNB
model =  MultinomialNB()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.69045


In [23]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
v=xgb.XGBClassifier()#objective='multi:softmax' indicates that the model should perform multiclass classification, and num_class=len(set(y)) sets the number of classes based on the unique labels in your target variable (y).
v.fit(X_train,y_train)

In [24]:
# Calculate accuracy
y_pred=v.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of XgboostClassifier: {accuracy * 100:.2f}%")

Accuracy of XgboostClassifier: 65.80%


In [39]:
from sklearn.linear_model import LogisticRegression
l=LogisticRegression(C=1, max_iter=200, multi_class='multinomial',
                   warm_start=True,solver= 'saga',l1_ratio=0.5,penalty='elasticnet')
l.fit(X_train,y_train)
#for logistic regression
y_pred = l.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [27]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50, 100, 200, 500],
    'multi_class': ['ovr', 'multinomial'],
    'class_weight': [None, 'balanced'],
    'warm_start': [True, False],
}

penalty: Regularization term ('l1', 'l2', 'elasticnet', 'none').

C: Inverse of regularization strength. Smaller values specify stronger regularization.

solver: Algorithm to use in the optimization problem.

max_iter: Maximum number of iterations for the solver to converge.

multi_class: Option for handling multiple classes ('ovr' for binary, 'multinomial' for multiclass).

class_weight: Weights associated with classes. 'balanced' adjusts weights based on class frequencies.

warm_start: If True, reuse the solution of the previous call to fit as initialization.

In [29]:
from sklearn.model_selection import RandomizedSearchCV
l=LogisticRegression()
rf_randomcv=RandomizedSearchCV(estimator=l,param_distributions=param_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)

In [31]:
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


126 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jayen\Text-sentiment-analysis-general-purpose\text-sent\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jayen\Text-sentiment-analysis-general-purpose\text-sent\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\jayen\Text-sentiment-analysis-general-purpose\text-sent\lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.pen

now doing deep learnig lstm bidirectional

In [32]:
print(rf_randomcv.best_params_)

{'warm_start': True, 'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'multinomial', 'max_iter': 200, 'class_weight': None, 'C': 1}


In [36]:
best_random_grid=rf_randomcv.best_estimator_
print(best_random_grid)

LogisticRegression(C=1, max_iter=200, multi_class='multinomial',
                   warm_start=True)


In [35]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(X_test)
#print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))

Accuracy Score 0.708225


In [None]:
import tensorflow
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Bidirectional
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['preprocessed'])
sequences = tokenizer.texts_to_sequences(df['preprocessed'])

In [None]:
max_length = 200  # Define your maximum sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_length)#THIS WILL BE MY ACTUAL DATA THAT'LL TRAIN TEST AND SPLIT

In [None]:
Considers information from both past and future states, which can be beneficial for capturing context.

In [None]:
word_index = tokenizer.word_index#THESE ARE MY UNIQUE WORDS WITH ITS UNIQUE INDEX
num_words = len(word_index)+1
## Creating model
from tensorflow.keras.layers import Dropout
embedding_vector_features=200 ##features representation each word wil get convert into a vector of 300
model=Sequential()
model.add(Embedding(num_words,embedding_vector_features,input_length=max_length))
model.add(Dropout(0.3))#THIS MEANS IM ADDING 20 PERCENT OF CHANCE THAT ANY OF NODES WERE TO DROPOUT WHICH MEANS WE MAKE THEM ZERO VALUE WE ONLY USE THIS WHEN OUR MODEL IS PERFORMIG BAD THIS IS USED FOR STOPING OVERFITTING
model.add(Bidirectional(LSTM(50,return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(50)))
model.add(Dense(8,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

import numpy as np
X_final=np.array(padded_sequences)
y_final=np.array(df['is_sarcastic'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='accuracy', mode='max', verbose=1, baseline=0.97)

In [None]:
model.fit(X_final,y_final, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

In [None]:
predicted_labels = [max(enumerate(pred), key=lambda x: x[1])[0] for pred in y_pred]

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f'Accuracy: {accuracy}')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)