In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from fastai.text.all import *
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


In [18]:
# Step 2: Load and inspect the dataset

# Load the dataset with specified encoding to handle special characters
twitter = pd.read_csv('/Users/muhammadhassanzahoor/Desktop/NEU/EAI 6010 - Applications of Artificial Intelligence/Module 3 - Text Classification with Transfer Learning/tweets.csv', encoding='ISO-8859-1')

# Inspect the data structure
print("Data preview:")
print(twitter.head())
print("Data columns:", twitter.columns)

# Take a random sample of 1000 rows
data = twitter.sample(n=1000, random_state=1)



Data preview:
   Target          ID                          Date      flag           User  \
0       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY  scotthamilton   
1       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY       mattycus   
2       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY        ElleCTF   
3       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         Karoli   
4       0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY       joy_wolf   

                                                                                                              Text  
0  is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!  
1                        @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds  
2                                                                  my whole body feels itchy and like its on fire   
3  @nationwideclass no, it's not beha

In [20]:
#Step 3: Filter and CLean Data

# Keep only the relevant columns
data = data[['Target', 'Text']]

# Check for missing values and drop them if present
data = data.dropna(subset=['Text', 'Target'])

# Map the 'Target' values to meaningful labels for clarity
data['Target'] = data['Target'].map({0: 'negative', 4: 'positive'})

# Verify the changes
print(data.head())
print("Class distribution:\n", data['Target'].value_counts())

           Target  \
1045953  positive   
358409   negative   
875012   positive   
555046   negative   
272972   negative   

                                                                                                                                       Text  
1045953                           Decided to have a raffle to get rid of some stuff I don't need. LOTS OF PRIZES  http://tinyurl.com/n7tk9s  
358409                                        @nneoma I wanted Diversity to win  Flawless were good but Diversity were proper off the hook!  
875012   @ericanoriega wow really, u like the rain? i hate it lol...but its really weird that its raining here...it normally rains on july   
555046                                                               Hell traffic. Can't believe I have to go down punt road in a few mins   
272972                                    Woke up and can't go back to sleep even though I'm really tired. Lol And in a slightly bad mood.   
Class distribution:
 

In [22]:
#Step 4:  Split Data into Training and Validation Sets

# Split data into train and validation sets (80-20 split)
train_data, valid_data = train_test_split(data, test_size=0.2, random_state=42)

# Verify the size of the splits
print(f"Training set size: {len(train_data)}, Validation set size: {len(valid_data)}")

Training set size: 800, Validation set size: 200


In [24]:
#Step 5: Tokenization
from fastai.text.all import TextDataLoaders, SentencePieceTokenizer

# Specify tokenizer explicitly
tok = SentencePieceTokenizer()

# Create the language model data loader
dls_lm = TextDataLoaders.from_df(data, text_col='Text', is_lm=True, valid_pct=0.2, seed=42, tok_tfm=tok)

# Create the classifier data loader
dls_clas = TextDataLoaders.from_df(train_data, text_col='Text', label_col='Target', valid_df=valid_data, seed=42, tok_tfm=tok)

print("Data loaders created successfully.")

#Step 6: Fine tuning the model
# Fine-tuning the language model
learn_lm = language_model_learner(dls_lm, AWD_LSTM, drop_mult=0.5, metrics=[accuracy, Perplexity()])
learn_lm.fit_one_cycle(1, 2e-2)  # Initial fine-tuning cycle
learn_lm.unfreeze()
learn_lm.fit_one_cycle(5, 2e-3)  # Further training after unfreezing

# Save the fine-tuned model
learn_lm.save('fine_tuned_lm')
learn_lm.save_encoder('fine_tuned_encoder')
print("Language model fine-tuning complete.")


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=tmp/texts.out --vocab_size=1112 --model_prefix=tmp/spm --character_coverage=0.99999 --model_type=unigram --unk_id=9 --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2 --user_defined_symbols=▁xxunk,▁xxpad,▁xxbos,▁xxeos,▁xxfld,▁xxrep,▁xxwrep,▁xxup,▁xxmaj --hard_vocab_limit=false


Data loaders created successfully.


  wgts = torch.load(wgts_fname, map_location = lambda storage,loc: storage)


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,8.361176,6.534743,0.037073,688.656982,00:02


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,6.224312,6.172145,0.049334,479.21286,00:02
1,5.965567,5.536211,0.083044,253.714859,00:02
2,5.776671,5.305614,0.101309,201.464661,00:01
3,5.617991,5.231066,0.109881,186.991989,00:01
4,5.515702,5.21386,0.112124,183.802094,00:01


Language model fine-tuning complete.


In [26]:
#Step 7: Training the text classifier

# Initialize the text classifier learner without loading the encoder
learn_clas = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

# Save the trained classifier model
learn_clas.save('text_classifier_model_from_scratch')
print("Text classifier training complete (trained from scratch).")

# Export the trained classifier for inference
learn_clas.export('text_classifier.pkl')
print("Text classifier model exported.")



# Train the classifier from scratch
learn_clas.fit_one_cycle(3, 2e-2)  # Initial training for the classifier
learn_clas.unfreeze()
learn_clas.fit_one_cycle(5, 2e-3)  # Further training after unfreezing

# Save the trained classifier model
learn_clas.save('text_classifier_model_from_scratch')
print("Text classifier training complete (trained from scratch).")



  wgts = torch.load(wgts_fname, map_location = lambda storage,loc: storage)


Text classifier training complete (trained from scratch).
Text classifier model exported.


epoch,train_loss,valid_loss,accuracy,time
0,0.774793,0.723497,0.4125,00:01
1,0.711383,0.537134,0.80625,00:00
2,0.670268,0.52091,0.8,00:00


epoch,train_loss,valid_loss,accuracy,time
0,0.572742,0.547163,0.7625,00:02
1,0.585633,0.575741,0.78125,00:02
2,0.561626,0.527168,0.7875,00:02
3,0.530579,0.555237,0.78125,00:02
4,0.491202,0.549697,0.79375,00:02


Text classifier training complete (trained from scratch).


In [28]:
#Step 8: Evaluate the Model

from sklearn.metrics import accuracy_score

# Get predictions and calculate accuracy
preds, targets = learn_clas.get_preds()
accuracy_dl = accuracy_score(targets, preds.argmax(dim=1))
print("Deep learning model accuracy:", accuracy_dl)


Deep learning model accuracy: 0.79375


In [36]:
# Step 9: Train a Traditional NLP Model (Naive Bayes)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import nltk

# Download NLTK stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# Define stopwords as a list
stop_words = list(stopwords.words('english'))

# Set up pipeline with CountVectorizer and Naive Bayes
nb_model = make_pipeline(
    CountVectorizer(stop_words=stop_words),
    MultinomialNB()
)

# Train the model
nb_model.fit(train_data['Text'], train_data['Target'])

# Set up pipeline with CountVectorizer and Naive Bayes
nb_model = make_pipeline(
    CountVectorizer(stop_words=stop_words),
    MultinomialNB()
)

# Train the model
nb_model.fit(train_data['Text'], train_data['Target'])


# Evaluate the model on the validation set
preds_nb = nb_model.predict(valid_data['Text'])
accuracy_nb = accuracy_score(valid_data['Target'], preds_nb)
print("Traditional Naive Bayes model accuracy:", accuracy_nb)


Traditional Naive Bayes model accuracy: 0.76


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/muhammadhassanzahoor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
#step 10: Compre the Model Performance

print("Deep Learning Model (AWD_LSTM + ULMFiT) Accuracy:", accuracy_dl)
print("Traditional NLP Model (Naive Bayes) Accuracy:", accuracy_nb)


Deep Learning Model (AWD_LSTM + ULMFiT) Accuracy: 0.79375
Traditional NLP Model (Naive Bayes) Accuracy: 0.76
