In [1]:
import pandas as pd
import re  # Regular expression
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk import word_tokenize, pos_tag

In [2]:
DATASET_COLUMNS=['target','ids','date','flag','user','text']
DATASET_ENCODING="ISO-8859-1"
df=pd.read_csv("twitter_csv.csv",encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

In [3]:
df

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1048571,4,1960186342,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Madelinedugganx,My GrandMa is making Dinenr with my Mum
1048572,4,1960186409,Fri May 29 07:33:43 PDT 2009,NO_QUERY,OffRoad_Dude,Mid-morning snack time... A bowl of cheese noo...
1048573,4,1960186429,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Falchion,@ShaDeLa same here say it like from the Termi...
1048574,4,1960186445,Fri May 29 07:33:44 PDT 2009,NO_QUERY,jonasobsessedx,@DestinyHope92 im great thaanks wbuu?


In [4]:
print(df['target'].value_counts())

target
0    800000
4    248576
Name: count, dtype: int64


In [5]:
df['target']=df['target'].replace(4,1)
print(df['target'].value_counts())

target
0    800000
1    248576
Name: count, dtype: int64


In [6]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

{"they'd", 'whom', 'but', 's', 'yours', 'them', 'we', 'very', "we've", 'those', 'you', 'him', "couldn't", 'in', 'am', "aren't", 'or', 'against', 'than', "hasn't", 'yourselves', 'has', 'nor', 'down', "wasn't", 'can', 'a', "i'd", 'are', 'did', "you've", 'mightn', "we're", 'ain', 'about', 'doesn', "hadn't", 'don', "don't", 'some', 'wasn', "i'll", 'above', 'any', 'will', 'as', 'been', 'at', "he'll", 'an', 'so', "didn't", 'again', 'the', 'weren', 'up', 'with', 'had', 'this', 't', 'have', 'these', 'same', 'she', 'doing', 'it', 'its', 'aren', 'needn', 'be', 'only', 'all', "needn't", 'why', "you'll", "mustn't", 'hasn', 'he', "you'd", 'yourself', 'by', "wouldn't", 'his', 'do', 'for', "it'd", 'off', 'haven', "she's", 'wouldn', 'other', "haven't", 'further', 'they', 'ma', "mightn't", 'o', 'hadn', 'now', 'more', 'because', 'before', 'shouldn', 'each', 'm', 'both', 'themselves', 'from', 'having', 'below', 'most', "shan't", 'under', 'myself', 'my', 'once', 'where', 'll', 'too', 'shan', 'if', "she'll

In [24]:
def clean_text(text):
    "@user I #love LOVE #AI and machine learning! Check out https://ai.example.com #ArtificalIntelligence"
    
    # Function to get NLTK POS tag to WordNet POS tag
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
           return wordnet.ADJ
        elif treebank_tag.startswith('V'):
           return wordnet.VERB
        elif treebank_tag.startswith('N'):
           return wordnet.NOUN
        elif treebank_tag.startswith('R'):
           return wordnet.ADJ
        else:
           return wordnet.NOUN
     
     
    
    # convert to lowercase
    text=text.lower()
    
    #Remove URLS
    text=re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))',' ',text)
    
    #Replace @mentions with 'USER'
    text=re.sub(r'@[^\S]+','USER',text)
    
    #REMOVE hashtags but keep the text
    text=re.sub(r'#(^\S)+','r/1',text)
    
    #Remove digits
    text=re.sub(r'\d+','',text)
    
    #Remove extra whitespace
    text=re.sub(r'\s+',' ',text)
    
    # Strip leading and trailing whitespace
    text=text.strip()
    
    #Remove stopwords
    text=" ".join([word for word in text.split() if word not in stop_words])
    
    #Tokenize text
    tokenizer=RegexpTokenizer(r'w+|[^\w\s]')
    tokens=tokenizer.tokenize(text)
    
    #pos position of speech
    post_tags=nltk.pos_tag(tokens)
    
    #Lemmatize the text
    lemmatizer=WordNetLemmatizer()
    lemmatized_tokens=[lemmatizer.lemmatize(token,get_wordnet_pos(tag)) for token, tag in post_tags]
    
    return " ".join(lemmatized_tokens)

In [31]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to C:\Users\Kiran
[nltk_data]     Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kiran Meena\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to C:\Users\Kiran
[nltk_data]     Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Kiran
[nltk_data]     Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [29]:
import nltk
print(nltk.data.path)


['C:\\Users\\Kiran Meena/nltk_data', 'c:\\Users\\Kiran Meena\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data', 'c:\\Users\\Kiran Meena\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data', 'c:\\Users\\Kiran Meena\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data', 'C:\\Users\\Kiran Meena\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [32]:
import nltk
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kiran Meena\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [36]:
df['text'] = df['text'].apply(clean_text)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Kiran Meena\\AppData\\Roaming\\nltk_data\\taggers\\averaged_perceptron_tagger_eng\\averaged_perceptron_tagger_eng.weights.json'

In [34]:
#split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'],df['target'],test_size=0.2,random_state=42)

In [35]:
# Initialize the IfidVectorizer
vectorizer=TfidfVectorizer(max_features=500000,ngram_range=(1,2))

# Fit  and transform the training data
X_train_vect= vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_vect=vectorizer.transform(X_test)