In [2]:
import pandas as pd
import re  # Regular expression
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk import word_tokenize, pos_tag
from sklearn.metrics import confusion_matrix, classification_report,  roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
DATASET_COLUMNS=['target','ids','date','flag','user','text']
DATASET_ENCODING="ISO-8859-1"
df=pd.read_csv("twitter_csv.csv",encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

In [4]:
df

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1048571,4,1960186342,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Madelinedugganx,My GrandMa is making Dinenr with my Mum
1048572,4,1960186409,Fri May 29 07:33:43 PDT 2009,NO_QUERY,OffRoad_Dude,Mid-morning snack time... A bowl of cheese noo...
1048573,4,1960186429,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Falchion,@ShaDeLa same here say it like from the Termi...
1048574,4,1960186445,Fri May 29 07:33:44 PDT 2009,NO_QUERY,jonasobsessedx,@DestinyHope92 im great thaanks wbuu?


In [5]:
df.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [6]:
df.duplicated()

0          False
1          False
2          False
3          False
4          False
           ...  
1048571    False
1048572    False
1048573    False
1048574    False
1048575    False
Length: 1048576, dtype: bool

In [7]:
print(df['target'].value_counts())

target
0    800000
4    248576
Name: count, dtype: int64


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Kiran
[nltk_data]     Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

{"it'll", 'theirs', "wasn't", "haven't", 'no', 'shan', 'because', 'him', 'too', 'his', 'so', 'or', 'other', 'should', "should've", 'with', 'some', "shouldn't", 'll', "mightn't", 'until', 'once', 'been', 'further', 'if', 'this', 'will', "he'd", 'when', 'how', 'yours', "they'd", 'just', 'each', 'in', 'ours', 'me', 'can', "i've", "needn't", 'nor', 'now', 'their', 'off', "they're", 'were', 'which', 'himself', "won't", 'whom', 'is', 'both', 'having', "aren't", 'did', 'didn', 'myself', 'ma', 'that', 'then', 'isn', 'for', 'own', 'are', 'by', 'over', 'she', 'mightn', 'those', 'weren', 'doesn', "we've", "don't", 'won', "hasn't", "he'll", 'here', 'itself', 'about', 'during', 'had', 'he', 'while', 'its', 'more', "i'm", 'where', 'them', 'any', "they'll", "he's", "i'd", "she'd", 'below', 'wouldn', 'few', 'have', 'i', 'most', 'ain', 'my', 'your', 'yourself', "she's", 'all', "it'd", 'm', 'of', 'only', 'y', 'hasn', 'we', 'has', 'down', "weren't", 'was', 'into', 'same', 'there', 'through', 'as', 'at', 

In [10]:
def clean_text(text):
    #"@user I #love LOVE #AI and machine learning! Check out https://ai.example.com #ArtificalIntelligence"
    
    # Function to get NLTK POS tag to WordNet POS tag
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
           return wordnet.ADJ
        elif treebank_tag.startswith('V'):
           return wordnet.VERB
        elif treebank_tag.startswith('N'):
           return wordnet.NOUN
        elif treebank_tag.startswith('R'):
           return wordnet.ADJ
        else:
           return wordnet.NOUN
     
     
    
    # convert to lowercase
    text=text.lower()
    
    #Remove URLS
    text=re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))',' ',text)
    
    #Replace @mentions with 'USER'
    text=re.sub(r'@^\w+','USER',text)
    
    #REMOVE hashtags but keep the text
    text=re.sub(r'#(^\w+)+','r/1',text)
    
    #Remove digits
    text=re.sub(r'\d+','',text)
    
    #Remove extra whitespace
    text=re.sub(r'\s+',' ',text)
    
    # Strip leading and trailing whitespace
    text=text.strip()
    
    #Remove stopwords
    text=" ".join([word for word in text.split() if word not in stop_words])
    
    #Tokenize text
    tokenizer=RegexpTokenizer(r'\w+|[^\w\s]')
    tokens=tokenizer.tokenize(text)
    
    #pos position of speech
    post_tags=nltk.pos_tag(tokens)
    
    #Lemmatize the text
    lemmatizer=WordNetLemmatizer()
    lemmatized_tokens=[lemmatizer.lemmatize(token,get_wordnet_pos(tag)) for token, tag in post_tags]
    
    return " ".join(lemmatized_tokens)

In [11]:
df['target']=df['target'].replace(4,1)
print(df['target'].value_counts())

target
0    800000
1    248576
Name: count, dtype: int64


In [12]:
import nltk
nltk.download('averaged_perceptron_tagger')  # This will download the tagger in JSON format
nltk.download('punkt')  # Tokenizer, if not already downloaded


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kiran Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Kiran
[nltk_data]     Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Kiran Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Kiran
[nltk_data]     Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
df['text'] = df['text'].apply(clean_text)

In [16]:
#split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'],df['target'],test_size=0.2,random_state=42)

In [17]:
# Initialize the IfidVectorizer
vectorizer=TfidfVectorizer(max_features=500000,ngram_range=(1,2))

# Fit  and transform the training data
X_train_vect= vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_vect=vectorizer.transform(X_test)

In [18]:
def evaluate_model(model):
    model.fit(X_train_vect, y_train)
    y_pred = model.predict(X_test_vect)
    print(classification_report(y_test, y_pred))

In [19]:
# Step 6: Training and evaluating models
# Logistic Regression
lr_model = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
evaluate_model(lr_model)

              precision    recall  f1-score   support

           0       0.87      0.94      0.90    160127
           1       0.73      0.53      0.62     49589

    accuracy                           0.84    209716
   macro avg       0.80      0.74      0.76    209716
weighted avg       0.84      0.84      0.83    209716



In [20]:
example_text="I #love @kiran"
cleaned_ex_data=clean_text(example_text)
vectorized_text=vectorizer.transform([cleaned_ex_data])
prediction=lr_model.predict(vectorized_text)
print(prediction)
                                     

[1]
