## Import

In [None]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
!pip3 install emoji



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, emoji, re, json
from typing import List
from string import punctuation
from sklearn.model_selection import train_test_split


In [None]:



CONFIGS = {'project':'transformer_model',
           'run_name':'run_21_10_2023', # run_01_01_2023
           "num_files":10,
           #'imbalance_handle':'upsample', # upsample/downsample
           "vector_size":200,
           "min_count":20,
           'window':10,
           'max_seq_length':100,
           'loss_fn':'categorical_crossentropy',
           'optimizer':'adam',
           'learning_rate':5e-5,
           'val_split':0.1,
           'bidirectional':True,
           'lstm_neurons':200,
           'classifier_actvn':'softmax',
           'max_epochs':500,
           'batch_size':32,
           'embed_dim': 128,  # Embedding size for each token
           "num_heads": 1,  # Number of attention heads
           'ff_dim': 64  # Hidden layer size in feed forward network inside transformer
           }

In [None]:
USE_FRESH_DATA = False

## Data Handler and Loading

In [None]:
class DataHandler:
    def __init__(self, root: str, label_column: str, text_column: str, file_ext: str = "xlsx",
                 old_vocab='/content/drive/MyDrive/Tweet Scraping/vocab.json',
                 min_count:int=200,
                 max_word_len:int=20, min_word_needed:int=3):
        self.root = root
        self.label_column = label_column
        self.text_column = text_column
        self.file_ext = file_ext
        self.vocab = {}
        self.old_vocab_path = old_vocab
        self.min_word_needed = min_word_needed
        self.filter_words = None
        self.min_count = min_count
        self.max_word_len = max_word_len
        if old_vocab is not None:
          with open(old_vocab) as fp:
            self.vocab = json.load(fp)
            less_freq_words = [k for k,v in self.vocab.items() if v<min_count]
            huge_words = [k for k in self.vocab.keys() if len(k)>max_word_len]
            self.filter_words = less_freq_words+huge_words
            self.vocab = {k:v for k,v in self.vocab.items() if k not in self.filter_words}



    def read_files(self, number_of_files: int = 10):
        all_filenames = [file_name for file_name in os.listdir(self.root)if file_name.split(".")[-1] == self.file_ext ][:number_of_files]

        combined_df = pd.concat([pd.read_excel(os.path.join(self.root, f)) for f in all_filenames])
        self.data = combined_df
        print(f"Read {len(all_filenames)} files. Read total {len(self.data)} rows.")
        print(f"Label Counts:\n {self.data[self.label_column].value_counts()}")
        return self



    def preprocess_tweet(self, tweet:str, noise:str, stop_words:List):
        noise = list(noise)

        ntweet = emoji.replace_emoji(tweet)
        pattern = re.compile(u"[\u200c-\u200f\u202a-\u202f\u2066-\u2069]")
        ntweet = pattern.sub('', ntweet)

        nntweet = ''
        for word in ntweet.split(" "):
          if '#' not in word:
            nntweet+=" "+word

        ntweet = nntweet

        for e in noise:
          ntweet = ntweet.lower().replace(e, " ")

        ntweet = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", ntweet)
        ntweet = re.sub(r'\s', ' ', ntweet)
        tweet_token = [t.replace("\n", " ") for t in ntweet.split(" ")]
        tweet_token = [t for t in tweet_token if len(t.strip())>0]

        tweet_no_noise = tweet_token
        new_tweet = [t for t in tweet_no_noise if t not in stop_words]


        new_tweet = " ".join(new_tweet)
        new_tweet = " ".join([t for t in new_tweet.split(' ') if len(t)>1])


        if '●' in new_tweet:
          print(tweet)
          print(new_tweet)
          print("\n")

        new_tweet = new_tweet.strip()


        if self.old_vocab_path is None:
          for t in new_tweet.split(' '):
            if self.vocab.get(t) is not None:
              self.vocab[t]+=1
            else:
              self.vocab[t]=1


        return new_tweet


    def data_clean(self, stopwords_path: str,apply_filter:bool=False):
        noise = '~؟॥”▬`%´•●=+÷।–][{}*“_…‘’&#\/;@abcdefghijklmnopqrstuvwxyz1234567890०१२३४५६७८९( )-.|!?",:—?।'+"'"
        stop_file = stopwords_path
        stop_words = []
        with open(stop_file) as fp:
          lines = fp.readlines()
          stop_words = list( map(lambda x:x.strip(), lines))

        if apply_filter and len(self.filter_words)>0:
          stop_words+=self.filter_words

        self.data['clean_tokenized_text'] = self.data[self.text_column].apply(lambda x: self.preprocess_tweet(x, noise, stop_words))
        self.data = self.data.drop_duplicates(subset=['clean_tokenized_text'])


        sorted_vocab = sorted(self.vocab.items(), key=lambda x:x[1])
        self.vocab = dict(sorted_vocab)
        if apply_filter:
          if len(self.filter_words)==0:
            less_freq_words = [k for k,v in self.vocab.items() if v<self.min_count]
            huge_words = [k for k in self.vocab.keys() if len(k)>self.max_word_len]
            self.filter_words = less_freq_words+huge_words

            self.data['clean_tokenized_text'] = self.data.clean_tokenized_text.apply(lambda x: self.filter_txt(x, self.filter_words))
            self.vocab = {k:v for k,v in self.vocab.items() if k not in self.filter_words}

        self.vocab = {k:v for k,v in self.vocab.items() if k not in self.filter_words}

        self.data = self.data[self.data.clean_tokenized_text.apply(lambda x: len(x.strip(" "))>=self.min_word_needed)]
        print(f"Cleaned data: {len(self.data)}, Vocab: {len(self.vocab)}")

        return self

    def filter_txt(self, text, filter_words):
      for w in filter_words:
        text = text.replace(w,'')


      return text.strip()


    def train_test_split(self, train_ratio:float=0.8, test_ratio:float=0.1,
                         valid_ratio:float=0.1):

        data=self.data
        labels = np.array(data_handler.data.label.to_list())
        X_train, X_test, Y_train, Y_test = train_test_split(data['clean_tokenized_text'],
                                                        labels,
                                                        random_state=0, train_size=train_ratio)
        X_valid, X_test, Y_valid, Y_test = train_test_split(X_test, Y_test, train_size=valid_ratio)

        return X_train, X_test, X_valid, Y_train, Y_test, Y_valid

    def handle_imbalance(self, how="downsample", labels=[1,2]):
      temp_df = self.data[self.data.label.isin(labels)].copy()

      majority_class = temp_df.label.mode().tolist()[0]
      print(f'\nMajority class is: {majority_class}.')

      major_df = temp_df[temp_df.label==majority_class]
      minor_df = temp_df[temp_df.label!=majority_class]
      num_diff = len(minor_df)
      print(f"Majority samples: {len(major_df)} and Minority Samples: {len(minor_df)}")



      if how=='downsample':
        redf = major_df.sample(n=num_diff)
        tdf = pd.concat([redf, minor_df])
      elif how=='upsample':
        redf = minor_df.sample(n=len(major_df)-num_diff)
        tdf = pd.concat([redf, minor_df, major_df])
      else:
        print('Handling imbalance not recognized. Step skipped.')
        return self

      self.data=tdf
      print(f'After handling imbalance by {how}: {self.data.label.value_counts()}')
      return self

    def convert_label(self, kind:str='ohe', labels=[1,2], ref={1:0, 2:1}):
        self.data = self.data[self.data.label.isin(labels)]
        self.label_ref = ref
        if ref is not None:
            self.data.label = self.data.label.apply(ref.get)
        else:
            if kind=='ohe':
                ohe = {}
                for i,l in enumerate(labels):
                    ohl = np.zeros(len(labels))
                    ohl[i] = 1
                    ohe[l]=ohl
                self.data.label = self.data.label.apply(ohe.get)
        return self

    def __iter__(self):
      for i, row in self.data.iterrows():
        yield row.clean_tokenized_text.split(' ')

    # def iter_xy(self, seq_length:int=100):
    #   for i, row in self.data.iterrows():
    #     words = np.array([word2token(w) for w in row.clean_tokenized_text[:seq_length]])

    #     yield words, row.label



In [None]:
# import json

# with open('vocab.json', 'w', encoding='utf8') as fp:
#   json.dump(data_handler.vocab, fp,ensure_ascii=False)

In [None]:
data_handler = DataHandler(root='/content/drive/MyDrive/Tweet Scraping/Label', label_column='label', text_column='text', min_count=2)

# data_handler.read_files(10).convert_label().data_clean(stopwords_path= "/content/drive/MyDrive/Tweet Scraping/Resources/nepali_stop_words.txt")

# OHE for labels
# data_handler.read_files(CONFIGS['num_files']).convert_label(ref={1:[1, 0], 2:[0, 1]}).data_clean(stopwords_path= "/content/drive/MyDrive/Tweet Scraping/Resources/nepali_stop_words.txt")

#
if USE_FRESH_DATA:
  data_handler.read_files(CONFIGS['num_files']).convert_label(ref={1:0, 2:1}).data_clean(apply_filter=True,  stopwords_path= "/content/drive/MyDrive/Tweet Scraping/Resources/stop_words_nepali_25_10_2023.txt") #.handle_imbalance(labels=[0,1])
  import json
  with open('/content/drive/MyDrive/Tweet Scraping/temp_vocab.json', 'w', encoding='utf8') as fp:
    json.dump(data_handler.vocab, fp,ensure_ascii=False)

  data_handler.data.to_csv('/content/drive/MyDrive/Tweet Scraping/temp_clean_data.csv')

else:
  import json
  with open("/content/drive/MyDrive/Tweet Scraping/temp_vocab.json") as fp:
    vocab = json.load(fp)

  filtered_data = pd.read_csv("/content/drive/MyDrive/Tweet Scraping/temp_clean_data.csv")
  data_handler.data = filtered_data
  data_handler.vocab = vocab
  data_handler = data_handler.handle_imbalance(labels=[0,1], how='upsample')



Majority class is: 0.
Majority samples: 45771 and Minority Samples: 27199
After handling imbalance by upsample: 1    45771
0    45771
Name: label, dtype: int64


In [None]:
# purge max repeated words
num_max_repeated_words = 100000
max_repeated_words = [k for k,v in data_handler.vocab.items() if v>num_max_repeated_words]
filtered_data = data_handler.data.copy()
new_vocab = {k:v for k, v in data_handler.vocab.items() if k not in max_repeated_words}
filtered_data['clean_tokenized_text'] = filtered_data.clean_tokenized_text.apply(lambda x: data_handler.filter_txt(x, max_repeated_words))


In [None]:
filtered_data = filtered_data[filtered_data.clean_tokenized_text.apply(lambda x: len(x.split(' ')))>1]


### Vocab
Legth of vocab and least/most popular vocab.

In [None]:
vocab = new_vocab #data_handler.vocab
print(f"Total number of words in vocabulary: {len(vocab)}")


Total number of words in vocabulary: 48832


In [None]:
vdf = pd.DataFrame([[k, v] for k, v in vocab.items()] , columns=['word', 'counts'], index = np.arange(len(vocab)))
vdf = vdf.sort_values('counts')
vdf

Unnamed: 0,word,counts
0,शाहहरुको,2
10503,पंचायतका,2
10504,ग्यांगलाई,2
10505,मैवाखोला,2
10506,दुहुँमा,2
...,...,...
48827,देउवा,6047
48828,मेयर,6274
48829,नेपाली,10864
48830,निर्वाचन,12482


In [None]:
fname = ['nepali_stop_words.txt', 'NLP_stop_words.txt', 'stop_words_nepali_25_10_2023.txt'][2]
with open('/content/drive/MyDrive/Tweet Scraping/Resources/'+fname) as fp:
  stop_words = [s.strip() for s in fp.readlines()]

### How many times certain word repeated in Negative/Positive class?

In [None]:
vocab_class = {1:{}, 0:{}}
vocab_words = list(vocab.keys())

for i, row in filtered_data.iterrows():
  vc=vocab_class[row.label]

  for word in row.clean_tokenized_text.split(" "):
    if vc.get(word) is None:
      vc[word]=1
    else:
      vc[word]+=1
  vocab_class[row.label].update(vc)
  # if i>100:
  #   break

In [None]:
vocab_class

{1: {'सत्तारुढ': 254,
  'गठबन्धनले': 276,
  'दलीय': 117,
  'गठबन्धन': 1782,
  'बाहिरका': 22,
  'दलसँग': 19,
  'चुनावी': 714,
  'तालमेल': 204,
  'खारेज': 485,
  'निर्णय': 316,
  'भोट': 10904,
  'फलानो': 74,
  'चिन्हमा': 158,
  'सुनियो': 50,
  'मतपत्र': 333,
  'जानकारि': 5,
  'ज्यादै': 6,
  'न्युन': 6,
  'पाइयो': 41,
  'कलंकी': 6,
  'आशै': 2,
  'नगरे': 102,
  'स्थापित': 407,
  'बाले': 74,
  'जमानामा': 22,
  'उती': 2,
  'उफ्रेका': 7,
  'मतगणना': 232,
  'सकिएपछि': 23,
  'घाममा': 41,
  'सुत्ने': 18,
  'गयो': 178,
  'पालो': 122,
  'बुढाको': 9,
  'देख्यौ': 10,
  'जनताले': 986,
  'हाले': 258,
  'कम्युनिस्ट': 270,
  'राप्रपा': 911,
  'माग्ने': 305,
  'अधक्ष्य': 8,
  'हुदै': 95,
  'कांग्रेसलाइ': 15,
  'हाल्न': 848,
  'हालेर': 310,
  'राजा': 296,
  'चाहियो': 106,
  'पातकी': 12,
  'बहुरुपी': 4,
  'हुनुभन्दा': 12,
  'कांग्रेस': 2482,
  'हाल्ने': 634,
  'गर्व': 24,
  'गर्नुस': 100,
  'माग्न': 631,
  'आउँदा': 76,
  'वर्षलाई': 4,
  'आउ': 63,
  'तिहारमा': 25,
  'नाचगान': 33,
  'यीनले': 21,
  'हसियाँ': 

In [None]:
vocab_class.keys()

dict_keys([1, 0])

### Most Repeated Words in Each Labels

In [None]:
vocab_class[1] = dict(sorted(vocab_class[1].items(), key=lambda x:x[1], reverse=True))
vocab_class[0] = dict(sorted(vocab_class[0].items(), key=lambda x:x[1], reverse=True))

In [None]:

data = [list(vocab_class[1].keys()), [1]*len(vocab_class[1]), list(vocab_class[1].values())]

vocab_class_df = pd.DataFrame(columns=['word', 'label', 'count'])
vocab_class_df['word']=list(vocab_class[1].keys())+list(vocab_class[0].keys())
vocab_class_df['label']=[1]*len(vocab_class[1])+[0]*len(vocab_class[0])
vocab_class_df['count']=list(vocab_class[1].values())+list(vocab_class[0].values())


vocab_class_df

Unnamed: 0,word,label,count
0,भोट,1,10904
1,निर्वाचन,1,5221
2,नेपाली,1,5134
3,देउवा,1,4109
4,मेयर,1,2686
...,...,...,...
89197,पौड्यालले,0,1
89198,जाेगाउने,0,1
89199,जुत्ताको,0,1
89200,मालाले,0,1


In [None]:
vocab_class_df.query("label==1").iloc[:10]


Unnamed: 0,word,label,count
0,भोट,1,10904
1,निर्वाचन,1,5221
2,नेपाली,1,5134
3,देउवा,1,4109
4,मेयर,1,2686
5,चुनाव,1,2630
6,उम्मेदवारी,1,2488
7,कांग्रेस,1,2482
8,पार्टी,1,2355
9,माओवादी,1,2032


In [None]:
clean_text = []
text_label = []
found_word = []
# 1, 0: pos, neg

for label in [0,1]:
  words= vocab_class_df.query("label==@label").iloc[:1].word.tolist()
  for word in words:
    for i, row in filtered_data.query('label==@label').iterrows():
      if word in row.clean_tokenized_text:
        clean_text.append(row.clean_tokenized_text)
        found_word.append(word)
        text_label.append(label)
        break

  words= vocab_class_df.query("label==@label").iloc[-100:-98].word.tolist()
  for word in words:
    for i, row in filtered_data.query('label==@label').iterrows():
      if word in row.clean_tokenized_text:
        clean_text.append(row.clean_tokenized_text)
        found_word.append(word)
        text_label.append(label)
        break

vcdf = pd.DataFrame(columns=['clean_text', 'text_label', 'found_word'])
vcdf['clean_text'] = clean_text
vcdf['text_label'] = text_label
vcdf['found_word'] = found_word

vcdf

Unnamed: 0,clean_text,text_label,found_word
0,मतदाताले कांग्रेसलाई भोट युवाको हातमा पार्टी ज...,0,भोट
1,गज्जब भएछ नेपालको न्यायलयले साधारण नेपाली संबै...,0,जिताउन्
2,गाउँ राजनितिमा बर्शे ठिटो बर्षको गवार बर्षका व...,0,ठिटो
3,भोट फलानो चिन्हमा सुनियो मतपत्र भोट जानकारि ज्...,1,भोट
4,जबसम्म ह्याङ्ग दाईले जम्मा पारेर दाई रुदै भिडी...,1,फिल्मलाई
5,विश्वप्रकाश गठबन्धन ताेला सुन क्वीन्टल आलु,1,ताेला


In [None]:
vcdf.to_csv('vcdf.csv')

# Using TFiDF

In [None]:
filtered_data

Unnamed: 0.1,Unnamed: 0,id,text,label,Unnamed: 3,clean_tokenized_text
63512,6550,1519923310947373056,सत्तारुढ गठबन्धनले पाँच दलीय गठबन्धन बाहिरका द...,1,,सत्तारुढ गठबन्धनले दलीय गठबन्धन बाहिरका दलसँग ...
59575,1950,1522612435114287104,भोट के मा फलानो चिन्हमा भन्ने मात्रै सुनियो तर...,1,,भोट फलानो चिन्हमा सुनियो मतपत्र भोट जानकारि ज्...
29674,733,1526560690055500032,@realpbhattarai १४ हाम्रो यो कलंकी area को आशै...,1,,कलंकी आशै नगरे स्थापित बाले जमानामा उती भोट उफ...
15413,1520,1588871568620675072,@kandeldai हैट कम्युनिस्ट भएर राप्रपा लाइ भोट ...,1,,कम्युनिस्ट राप्रपा भोट माग्ने अधक्ष्य हुदै कां...
57544,8443,1522955026104160000,भोट माग्न आउँदा ५ वर्षलाई कमसेकम ५ लाख जति लिए...,1,,भोट माग्न आउँदा वर्षलाई आउ तिहारमा नाचगान
...,...,...,...,...,...,...
72913,8820,1519346755158298880,दशकौंदेखि नेका.का.क्षेत्र ३को अभिभावक रहेको अम...,0,,अभिभावक अम्बिका बस्नेत केन्द्रिय सदस्यबाट राजि...
72959,8886,1519237744316329984,पार्टीले टिकट दिएन भनेर पार्टीबाट राजीनामा दिन...,0,,टिकट दिएन पार्टीबाट राजीनामा पार्टी अवसरवादी त...
72967,8897,1520662751265260032,किसानो के ढ्वार योगी सरकार https://t.co/83zyxZ...,0,,योगी सरकार
72968,8898,1520662742067080960,@KiranjainReal #हर____हर___महादेव 🙏🚩🚩\nजय श्री...,0,,जय महाकाल सरकार


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "),
                                         sublinear_tf=True, encoding='utf-8',
                                         decode_error='ignore',
                                         stop_words=stop_words,
                                   vocabulary=vocab.keys()
                                   )


# get tfidf of all data
X_tfidf = tfidf_vectorizer.fit_transform(filtered_data['clean_tokenized_text'])



In [None]:
X_train_tfidf, X_test_tfidf, Y_train, Y_test = train_test_split(X_tfidf,
                                                        filtered_data['label'],
                                                        random_state=111, train_size=0.8)
X_train_tfidf.shape

(72967, 48832)

In [None]:
# now do split
# X_train_tfidf, X_test_tfidf, Y_train, Y_test = train_test_split(X_tfidf,
#                                                         filtered_data['label'],
#                                                         random_state=111, train_size=0.8)
# X_tfidf = tfidf_vectorizer.fit_transform(train_rows['clean_tokenized_text'])
# X_train_tfidf, X_test_tfidf = X_tfidf, tfidf_vectorizer.transform(test_rows['clean_tokenized_text'])
# Y_train, Y_test = train_rows['label'], test_rows['label']

In [None]:
X_test_tfidf.shape

(18242, 48832)

## TFIDF Features

In [None]:
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_names[0]

'शाहहरुको'

In [None]:
tfidf_df = pd.DataFrame(X_tfidf.toarray(), index=filtered_data['clean_tokenized_text'], columns=feature_names)
tfidf_df

Unnamed: 0_level_0,शाहहरुको,दगुरेको,कुम्भकरण,बताइदिनु,लठैतहरुको,निसाफ,ताउलोमा,हाजिरि,तिनलाख,दाइजोका,...,चुनाव,स्थानीय,उम्मेदवारी,उम्मेदवार,कांग्रेस,देउवा,मेयर,नेपाली,निर्वाचन,भोट
clean_tokenized_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
सत्तारुढ गठबन्धनले दलीय गठबन्धन बाहिरका दलसँग चुनावी तालमेल खारेज निर्णय,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
भोट फलानो चिन्हमा सुनियो मतपत्र भोट जानकारि ज्यादै न्युन पाइयो,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.188576
कलंकी आशै नगरे स्थापित बाले जमानामा उती भोट उफ्रेका मतगणना सकिएपछि घाममा सुत्ने गयो पालो बुढाको पालो स्थापित देख्यौ जनताले भोट हाले,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.122589
कम्युनिस्ट राप्रपा भोट माग्ने अधक्ष्य हुदै कांग्रेसलाइ भोट हाल्न राप्रपा भोट हालेर राजा चाहियो पातकी बहुरुपी कम्युनिस्ट हुनुभन्दा कांग्रेस भोट हाल्ने कम्युनिस्ट गर्व गर्नुस,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.114274,0.0,0.0,0.0,0.0,0.185184
भोट माग्न आउँदा वर्षलाई आउ तिहारमा नाचगान,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.135629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
अभिभावक अम्बिका बस्नेत केन्द्रिय सदस्यबाट राजिनामा दिनु नातावाद कृपावाद पैसाको राजनिती रुपान्तरणको युवा पुस्ताको संघर्ष कायमै रहन्छ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
टिकट दिएन पार्टीबाट राजीनामा पार्टी अवसरवादी तीनिहरु पार्टीमा उपयुक्त चुनाव आउँछ असली,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.168535,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
योगी सरकार,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
जय महाकाल सरकार,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000


In [None]:
filtered_data['len_text'] = filtered_data.clean_tokenized_text.apply(lambda x: len(x.split(" ")))
special_df = filtered_data.sort_values('len_text').loc[[1,2]]
special_df

Unnamed: 0.1,Unnamed: 0,id,text,label,Unnamed: 3,clean_tokenized_text,len_text
1,1,1601226348013707008,@KanakManiDixit @Himal_Khabar अब नयाँ जनप्रतिन...,1,,जनप्रतिनिधि सर्वोच्च न्यायाधीश महाभियोग हटाउनु,5
1,1,1601226348013707008,@KanakManiDixit @Himal_Khabar अब नयाँ जनप्रतिन...,1,,जनप्रतिनिधि सर्वोच्च न्यायाधीश महाभियोग हटाउनु,5
2,2,1601225158538117120,@damasebijaya च्याखे थापेर करारका जनप्रतिनिधिन...,1,,च्याखे थापेर अत्याचार निजामती पास जनप्रतिनिधि ...,11
2,2,1601225158538117120,@damasebijaya च्याखे थापेर करारका जनप्रतिनिधिन...,1,,च्याखे थापेर अत्याचार निजामती पास जनप्रतिनिधि ...,11


In [None]:
tmpdf = tfidf_df.loc[special_df.clean_tokenized_text]
tmpdf = tmpdf[tmpdf.columns[(tmpdf > 0.01).any()]].reset_index()
tmpdf = pd.merge(tmpdf, special_df, on='clean_tokenized_text').drop_duplicates()
tmpdf = tmpdf[[c for c in tmpdf.columns if 'Unnamed' not in c]]
tmpdf

Unnamed: 0,clean_tokenized_text,निजामती,च्याखे,थापेर,हटाउनु,करार,झुठो,अत्याचार,महाभियोग,न्यायाधीश,बोलेर,सर्वोच्च,पास,पद,जनप्रतिनिधि,id,text,label,len_text
0,जनप्रतिनिधि सर्वोच्च न्यायाधीश महाभियोग हटाउनु,0.0,0.0,0.0,0.520596,0.0,0.0,0.0,0.477028,0.472018,0.0,0.412148,0.0,0.0,0.329783,1601226348013707008,@KanakManiDixit @Himal_Khabar अब नयाँ जनप्रतिन...,1,5
8,च्याखे थापेर अत्याचार निजामती पास जनप्रतिनिधि ...,0.343676,0.334846,0.334846,0.0,0.32943,0.320601,0.314631,0.0,0.0,0.286632,0.0,0.420518,0.215414,0.204805,1601225158538117120,@damasebijaya च्याखे थापेर करारका जनप्रतिनिधिन...,1,11


In [None]:
tmpdf.to_csv('tmp.csv')

## Train Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report

# Train Logistic Regression Classifier
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred_train = classifier.predict(X_train_tfidf)
y_pred = classifier.predict(X_test_tfidf)


print("Traning Performance")
accuracy = accuracy_score(Y_train, y_pred_train)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_train, y_pred_train)
f1 = f1_score(Y_train, y_pred_train)
recall = recall_score(Y_train, y_pred_train)
confusion = confusion_matrix(Y_train, y_pred_train)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_train, y_pred_train)

print("Classification Report:\n", report)



# Evaluate the Model

print("Testing Performance")
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_test, y_pred)

print("Classification Report:\n", report)

Traning Performance
Accuracy: 0.8476
Precision:  0.829012185636505
F1 score:  0.851882934367049
Recall score:  0.8760513986684567
Confusion Matrix:
  [[29873  6595]
 [ 4524 31975]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.82      0.84     36468
           1       0.83      0.88      0.85     36499

    accuracy                           0.85     72967
   macro avg       0.85      0.85      0.85     72967
weighted avg       0.85      0.85      0.85     72967

Testing Performance
Accuracy: 0.7671
Precision:  0.751216482037478
F1 score:  0.7735607675906183
Recall score:  0.797275024722558
Confusion Matrix:
  [[6738 2403]
 [1845 7256]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.74      0.76      9141
           1       0.75      0.80      0.77      9101

    accuracy                           0.77     18242
   macro avg       0.77      0.77      0.77     1

## Train Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report


# Train Decision Tree Classifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred_train = classifier.predict(X_train_tfidf)
y_pred = classifier.predict(X_test_tfidf)


print("Traning Performance")
accuracy = accuracy_score(Y_train, y_pred_train)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_train, y_pred_train)
f1 = f1_score(Y_train, y_pred_train)
recall = recall_score(Y_train, y_pred_train)
confusion = confusion_matrix(Y_train, y_pred_train)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_train, y_pred_train)

print("Classification Report:\n", report)






# Evaluate the Model

print("Testing Performance")
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_test, y_pred)

print("Classification Report:\n", report)

Traning Performance
Accuracy: 0.9994
Precision:  0.9996436208125445
F1 score:  0.9993559614673116
Recall score:  0.9990684676292502
Confusion Matrix:
  [[36455    13]
 [   34 36465]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     36468
           1       1.00      1.00      1.00     36499

    accuracy                           1.00     72967
   macro avg       1.00      1.00      1.00     72967
weighted avg       1.00      1.00      1.00     72967

Testing Performance
Accuracy: 0.7869
Precision:  0.7542182775772944
F1 score:  0.7991112948227757
Recall score:  0.8496868475991649
Confusion Matrix:
  [[6621 2520]
 [1368 7733]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.72      0.77      9141
           1       0.75      0.85      0.80      9101

    accuracy                           0.79     18242
   macro avg       0.79      0.79      0.79  

## Train NaiveBayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report


# Train Binary Naive Bayes Classifier
classifier = BernoulliNB()
classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred_train = classifier.predict(X_train_tfidf)
y_pred = classifier.predict(X_test_tfidf)


print("Traning Performance")
accuracy = accuracy_score(Y_train, y_pred_train)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_train, y_pred_train)
f1 = f1_score(Y_train, y_pred_train)
recall = recall_score(Y_train, y_pred_train)
confusion = confusion_matrix(Y_train, y_pred_train)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_train, y_pred_train)

print("Classification Report:\n", report)

# Evaluate the Model

print("Testing Performance")
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_test, y_pred)

print("Classification Report:\n", report)

Traning Performance
Accuracy: 0.8218
Precision:  0.8111558427799554
F1 score:  0.8248872129822906
Recall score:  0.8390914819584098
Confusion Matrix:
  [[29338  7130]
 [ 5873 30626]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.82     36468
           1       0.81      0.84      0.82     36499

    accuracy                           0.82     72967
   macro avg       0.82      0.82      0.82     72967
weighted avg       0.82      0.82      0.82     72967

Testing Performance
Accuracy: 0.7505
Precision:  0.7369298062903562
F1 score:  0.7566700529326847
Recall score:  0.777496978354027
Confusion Matrix:
  [[6615 2526]
 [2025 7076]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.72      0.74      9141
           1       0.74      0.78      0.76      9101

    accuracy                           0.75     18242
   macro avg       0.75      0.75      0.75   

## Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report


# Train Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred_train = classifier.predict(X_train_tfidf)
y_pred = classifier.predict(X_test_tfidf)


print("Traning Performance")
accuracy = accuracy_score(Y_train, y_pred_train)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_train, y_pred_train)
f1 = f1_score(Y_train, y_pred_train)
recall = recall_score(Y_train, y_pred_train)
confusion = confusion_matrix(Y_train, y_pred_train)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_train, y_pred_train)

print("Classification Report:\n", report)


# Evaluate the Model

print("Testing Performance")
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_test, y_pred)

print("Classification Report:\n", report)

Traning Performance
Accuracy: 0.9994
Precision:  0.9993424657534247
F1 score:  0.9993561555637749
Recall score:  0.9993698457491986
Confusion Matrix:
  [[36444    24]
 [   23 36476]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     36468
           1       1.00      1.00      1.00     36499

    accuracy                           1.00     72967
   macro avg       1.00      1.00      1.00     72967
weighted avg       1.00      1.00      1.00     72967

Testing Performance
Accuracy: 0.8170
Precision:  0.7799261562378547
F1 score:  0.8278244727478987
Recall score:  0.8819909900010988
Confusion Matrix:
  [[6876 2265]
 [1074 8027]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.75      0.80      9141
           1       0.78      0.88      0.83      9101

    accuracy                           0.82     18242
   macro avg       0.82      0.82      0.82  

## Train SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report


# Train SVM Classifier
classifier = SVC(kernel='linear', C=1.0, random_state=42)
classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred_train = classifier.predict(X_train_tfidf)
y_pred = classifier.predict(X_test_tfidf)


print("Traning Performance")
accuracy = accuracy_score(Y_train, y_pred_train)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_train, y_pred_train)
f1 = f1_score(Y_train, y_pred_train)
recall = recall_score(Y_train, y_pred_train)
confusion = confusion_matrix(Y_train, y_pred_train)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_train, y_pred_train)

print("Classification Report:\n", report)

# Evaluate the Model

print("Testing Performance")
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_test, y_pred)

print("Classification Report:\n", report)

Traning Performance
Accuracy: 0.8807
Precision:  0.8618705410612925
F1 score:  0.8838128295841399
Recall score:  0.9069015589468205
Confusion Matrix:
  [[31163  5305]
 [ 3398 33101]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.88     36468
           1       0.86      0.91      0.88     36499

    accuracy                           0.88     72967
   macro avg       0.88      0.88      0.88     72967
weighted avg       0.88      0.88      0.88     72967

Testing Performance
Accuracy: 0.7760
Precision:  0.7589341045238587
F1 score:  0.7824096257253901
Recall score:  0.8073838039775849
Confusion Matrix:
  [[6807 2334]
 [1753 7348]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.74      0.77      9141
           1       0.76      0.81      0.78      9101

    accuracy                           0.78     18242
   macro avg       0.78      0.78      0.78  

## SVM RBF

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report

# Train SVM Classifier
classifier = SVC(kernel='linear', C=1.5, gamma='auto', random_state=42)
classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the Model
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)

report = classification_report(Y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.7788
Precision:  0.7624326564442603
F1 score:  0.7848344264917613
Recall score:  0.8085924623667728
Confusion Matrix:
  [[6848 2293]
 [1742 7359]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.75      0.77      9141
           1       0.76      0.81      0.78      9101

    accuracy                           0.78     18242
   macro avg       0.78      0.78      0.78     18242
weighted avg       0.78      0.78      0.78     18242



## KN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report
classifier = KNeighborsClassifier(n_neighbors=5)

classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the Model
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)


report = classification_report(Y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.7939
Precision:  0.7780901801520358
F1 score:  0.7989734816082122
Recall score:  0.8210086803647951
Confusion Matrix:
  [[7010 2131]
 [1629 7472]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.77      0.79      9141
           1       0.78      0.82      0.80      9101

    accuracy                           0.79     18242
   macro avg       0.79      0.79      0.79     18242
weighted avg       0.79      0.79      0.79     18242



## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report

classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=.05, max_depth=5, random_state=42)


classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the Model
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)


report = classification_report(Y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.6451
Precision:  0.6105546671155626
F1 score:  0.6914498141263941
Recall score:  0.7970552686517965
Confusion Matrix:
  [[4514 4627]
 [1847 7254]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.49      0.58      9141
           1       0.61      0.80      0.69      9101

    accuracy                           0.65     18242
   macro avg       0.66      0.65      0.64     18242
weighted avg       0.66      0.65      0.64     18242



## Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report

classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=42)


classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the Model
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)


report = classification_report(Y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.6035
Precision:  0.564708327559928
F1 score:  0.6926704907584448
Recall score:  0.8956158663883089
Confusion Matrix:
  [[2858 6283]
 [ 950 8151]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.31      0.44      9141
           1       0.56      0.90      0.69      9101

    accuracy                           0.60     18242
   macro avg       0.66      0.60      0.57     18242
weighted avg       0.66      0.60      0.57     18242



## XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report

classifier = XGBClassifier()

classifier.fit(X_train_tfidf, Y_train)

# Make Predictions
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the Model
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
confusion = confusion_matrix(Y_test, y_pred)
print("Precision: " , precision)
print("F1 score: ",f1)
print("Recall score: ", recall)
print("Confusion Matrix:\n ", confusion)


report = classification_report(Y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.6976
Precision:  0.6713520749665328
F1 score:  0.7179303645380644
Recall score:  0.771453686408087
Confusion Matrix:
  [[5704 3437]
 [2080 7021]]
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.62      0.67      9141
           1       0.67      0.77      0.72      9101

    accuracy                           0.70     18242
   macro avg       0.70      0.70      0.70     18242
weighted avg       0.70      0.70      0.70     18242



In [None]:
# just to make sure not run below.
sadgfsadf

NameError: ignored

# Using Word2Vec

## Logistic Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm

# Download NLTK resources if not already downloaded
nltk.download('punkt')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(filtered_data['clean_tokenized_text'],filtered_data['label'], test_size=0.2, random_state=42)

# Tokenize the text using NLTK
tokenized_train = [word_tokenize(text.lower()) for text in tqdm(X_train, desc="Tokenizing Train Data")]
tokenized_test = [word_tokenize(text.lower()) for text in tqdm(X_test, desc="Tokenizing Test Data")]

# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_train, vector_size=10000, window=15, min_count=1, workers=4)

# Function to calculate document vectors using Word2Vec model
def calculate_doc_vector(model, tokenized_text):
    vector_sum = np.zeros(model.vector_size)
    for word in tokenized_text:
        if word in model.wv:
            vector_sum += model.wv[word]
    return vector_sum / len(tokenized_text)

# Create document vectors for training and testing sets
X_train_w2v = np.array([calculate_doc_vector(w2v_model, tokenized_text) for tokenized_text in tqdm(tokenized_train, desc="Creating Train Vectors")])
X_test_w2v = np.array([calculate_doc_vector(w2v_model, tokenized_text) for tokenized_text in tqdm(tokenized_test, desc="Creating Test Vectors")])

# Train a logistic regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train_w2v, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test_w2v)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree Classifier
clf = DecisionTreeClassifier()
clf.fit(X_train_w2v, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test_w2v)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

## naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Train Binary Naive Bayes Classifier
clf = BernoulliNB()
clf.fit(X_train_w2v, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test_w2v)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_w2v, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test_w2v)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

## SVM

In [None]:
from sklearn.svm import SVC

# Train SVM Classifier
clf = SVC(kernel='linear', C=1.0, random_state=42)
clf.fit(X_train_w2v, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test_w2v)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# X_train, X_test, X_valid, Y_train, Y_test, Y_valid = data_handler.train_test_split()

vectorizer = CountVectorizer(ngram_range=(1, 3), vocabulary=list(vocab.keys())).fit(list(vocab.keys()))

X = vectorizer.transform(filtered_data['clean_tokenized_text']).toarray()
y = filtered_data['label'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
X.shape, y.shape

In [None]:
vectorizer.get_feature_names_out()[1000:1005]

In [None]:
filtered_data.label.unique() # problem yehi ho.... only one label of data is there

## MultinomialNB

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create a CountVectorizer to convert text to a bag-of-words
vectorizer = CountVectorizer()  # You can adjust max_features based on your dataset size

train_data, test_data, train_labels, test_labels = X_train, X_test, y_train, y_test

# Fit and transform on the training data
#train_features = vectorizer.fit_transform(train_data)

# Transform the test data
#test_features = vectorizer.fit_transform(test_data)

# Create a Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(train_data, train_labels)

# Make predictions on the test set
predictions = classifier.predict(test_data)

# Evaluate accuracy
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report and confusion matrix
print('\nClassification Report:')
print(classification_report(test_labels, predictions))

print('\nConfusion Matrix:')
print(confusion_matrix(test_labels, predictions))


In [None]:
# see now its different?

## Train Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression  # or any other classifier you prefer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score

In [None]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)


print(f"Train Score: {train_score}, Test Score: {test_score}")

In [None]:
Y_pred = clf.predict(X_test)
f1 = f1_score(y_test, Y_pred)
recall = recall_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred)
print(f1)
print(recall)
print(precision)

## RandomForest

In [None]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [None]:
rf = RandomForestClassifier(bootstrap=True,criterion='entropy',)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
f1 = f1_score(y_test, y_pred)

recall = recall_score(y_test, y_pred)

precision = precision_score(y_test, y_pred)


print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

## SVM

In [None]:
from sklearn import model_selection, svm

In [None]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train, y_train)
# predict labels
predictions_SVM = SVM.predict(X_test)
#get the accuracy
print("Accuracy: ",accuracy_score(predictions_SVM, y_test)*100)

In [None]:
f1 = f1_score(y_test, predictions_SVM)

recall = recall_score(y_test, predictions_SVM)
precision = precision_score(y_test, predictions_SVM)
print(f1)
print(recall)
print(precision)

## NaiveBayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# making predictions on the testing set
y_pred = gnb.predict(X_test)

In [None]:
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

In [None]:
f1 = f1_score(Y_test, y_pred)


recall = recall_score(Y_test, y_pred)


precision = precision_score(Y_test, y_pred)
print(f1)
print(recall)
print(precision)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf_gini = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
clf_gini.fit(X_train, y_train)

In [None]:
y_pred_gini = clf_gini.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score

print('Model accuracy score with criterion gini index: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gini)))