###Preprocessing the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
!git clone https://github.com/khushboogupta13/Inter-IIT-Tech-Meet.git

Cloning into 'Inter-IIT-Tech-Meet'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 45 (delta 7), reused 32 (delta 3), pack-reused 0[K
Unpacking objects: 100% (45/45), done.


#####Reading the data of tweets and articles into a dataframe

In [3]:
data_tweets = pd.read_excel("/content/Inter-IIT-Tech-Meet/datasets/IIT tech meet 2021_BRIDGEi2i_NLP/Development Data/dev_data_tweet.xlsx")

In [4]:
data_tweets = data_tweets.drop_duplicates(subset = 'Tweet')

In [5]:
non_mobile_tech_df = data_tweets.loc[data_tweets['Mobile_Tech_Tag'] == 0]

In [6]:
non_mobile_tech_df

Unnamed: 0,Tweet_ID,Tweet,Mobile_Tech_Tag
0,tweet_0001,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0
1,tweet_0002,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0
2,tweet_0003,RT @stealyoman_cuso: really says a lot about s...,0
3,tweet_0004,RT @PGtzsche1: HPV vaccines increased serious ...,0
4,tweet_0005,Ramaphosa says if you are positive you must se...,0
...,...,...,...
2989,tweet_2990,@sahildwivedi857 @idesibanda @TheLallantop Pai...,0
2990,tweet_2991,@sumitgu73524312 @Bajaj_Finance Kya hi bol rah...,0
2992,tweet_2993,@CryptooAdy Ady bhi m thoda thoda kar lata hu ...,0
2995,tweet_2996,RT @TV9Bharatvarsh: होम लोन बांटने में #SBI ने...,0


In [7]:
mobile_tech_df = data_tweets.loc[data_tweets['Mobile_Tech_Tag'] == 1]

In [8]:
mobile_tech_df

Unnamed: 0,Tweet_ID,Tweet,Mobile_Tech_Tag
3000,tweet_3001,#Samsung is now making another addition into i...,1
3001,tweet_3002,QT @AwamiWeb: Want my hands on #GalaxyS21 🥺 ; ...,1
3002,tweet_3003,QT @AwamiWeb: This is a next level camera 🤩 #G...,1
3003,tweet_3004,Trick to get Google Home Mini at 499 Only With...,1
3004,tweet_3005,QT @MadhavSheth1: We are the pioneers of 64MP ...,1
...,...,...,...
3991,tweet_3992,#Realme ने 5G सेगमेंट में एक धांसू फोन #Realme...,1
3992,tweet_3993,"भारत में लॉन्च हुआ Xiaomi Mi 10i, यह है 2021 क...",1
3997,tweet_3998,चीनी मोबाइल कंपनी रियलमी ने अपना Realme V15 5G...,1
3998,tweet_3999,QT @AnjaliSingh_IN: Aapne phone hi galat choos...,1


#####Do Undersampling as the data is balanced

In [9]:
import random

tweet_ids = list(non_mobile_tech_df['Tweet_ID'])
tweet_ids_undersampled = random.sample(tweet_ids, 569)
non_mobile_tech_undersampled_df = non_mobile_tech_df[non_mobile_tech_df['Tweet_ID'].isin(tweet_ids_undersampled)]

In [10]:
new_df = pd.concat([non_mobile_tech_undersampled_df, mobile_tech_df])

#####Translating all the tweets irrespective of the language

In [11]:
pip install google_trans_new

Collecting google_trans_new
  Downloading https://files.pythonhosted.org/packages/f9/7b/9f136106dc5824dc98185c97991d3cd9b53e70a197154dd49f7b899128f6/google_trans_new-1.1.9-py3-none-any.whl
Installing collected packages: google-trans-new
Successfully installed google-trans-new-1.1.9


In [12]:
from google_trans_new import google_translator

translator = google_translator()

def trans(t):
  return translator.translate(t,lang_tgt='en') 

In [13]:
original_tweets = list(new_df['Tweet'])
translated_tweets = [trans(text) for text in original_tweets]
new_df['Tweet'] = new_df['Tweet'].replace(original_tweets, translated_tweets)

In [14]:
new_df

Unnamed: 0,Tweet_ID,Tweet,Mobile_Tech_Tag
0,tweet_0001,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0
4,tweet_0005,Ramaphosa says if you are positive you must se...,0
5,tweet_0006,"@KarlBrophy 200,000 is a lot of the people tha...",0
7,tweet_0008,@EdwardthembaSa 😂 I don't think hz taking that...,0
8,tweet_0009,QT @ElevationToday: Kini gbogbo eleyi bayi? E ...,0
...,...,...,...
3991,tweet_3992,#Realme has launched a Dhansu Phone # Realmev1...,1
3992,tweet_3993,"Xiaomi Mi 10i launched in India, it is the che...",1
3997,tweet_3998,Chinese mobile company Realmi has launched its...,1
3998,tweet_3999,Qt @anjalisingh_in: aapne phone hi galat choos...,1


#####Create a list of all the words

In [15]:
all_words = []
texts = list(new_df['Tweet'])
for i in texts:
  t = list(i.split())
  all_words += t

In [16]:
all_words = set(all_words)

#####Filter words

In [17]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
def clean(word):
  word = re.sub(r"\d", "", word)
  word = re.sub(r'[^\x00-\x7f]', r' ', word)
  if ('@' in word) or ("http" in word):
    return ''
  if word in stopwords.words('english'):
    return '' 
  if (word.lower() == "rt") or (word.lower() == "qt"):
    return '' 
  word = re.sub(r'[^\w\s]', "", word)
  return word

In [20]:
all_words = [clean(w.strip()) for w in all_words if len(clean(w.strip())) > 0]

#####Cleaning the tweets

In [21]:
def clean_sentence(text):
  t = list(text.split())
  t = [i.lower() for i in t if i in all_words]
  return " ".join(t)

In [22]:
tweets = list(new_df['Tweet'])
cleaned_tweets = [clean_sentence(i) for i in tweets]
new_df['Tweet'] = new_df['Tweet'].replace(tweets, cleaned_tweets)

#####Tokenization, Stemming, Making bag of words and Preparing Training Data

In [23]:
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenize(sentence):
    return nltk.word_tokenize(sentence)

def stem(word):
    return stemmer.stem(word.lower())

def bag_of_words(sentence):
    tokenized_sentence = tokenize(sentence)
    sentence_words = [stem(word) for word in tokenized_sentence]
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in sentence_words: 
            bag[idx] = 1
    return bag  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [24]:
tweets = list(new_df['Tweet'])
X_train = [bag_of_words(i) for i in tweets]
Y_train = list(new_df['Mobile_Tech_Tag'])

In [25]:
X_train = np.array(X_train)
Y_train = np.array(Y_train)

#####Training

In [26]:
num_epochs = 10
batch_size = 8
learning_rate = 0.001
input_size = len(X_train[0])
output_size = 2

In [27]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [28]:
class ChatDataset(Dataset):

    def __init__(self):
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = Y_train

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.n_samples

In [29]:
dataset = ChatDataset()

train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [31]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, 1024) 
        self.l2 = nn.Linear(1024, 256) 
        self.l3 = nn.Linear(256, 64)
        self.l4 = nn.Linear(64, 16)
        self.l5 = nn.Linear(16, num_classes)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        out = self.relu(out)
        out = self.l4(out)
        out = self.relu(out)
        out = self.l5(out)
        return out

In [32]:
model = NeuralNet(input_size, output_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [33]:
for epoch in range(num_epochs):
    for (w, labels) in train_loader:
        w = w.to(device)
        labels = labels.to(dtype=torch.long).to(device)
        
        outputs = model(w)

        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 0.0007
Epoch [2/10], Loss: 0.0124
Epoch [3/10], Loss: 0.0157
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000


In [36]:
data = {
"model_state": model.state_dict(),
"input_size": input_size,
"output_size": output_size,
"all_words": all_words,
"tags": [0, 1]
}

In [37]:
FILE = "tweets_classification.pth"
torch.save(data, FILE)

#####Predict the label

In [38]:
def predict(tweet):
    tweet = trans(tweet)
    tweet = clean_sentence(tweet)
    X = bag_of_words(tweet)
    X = np.array(X)
    X = torch.from_numpy(X).to(device)
    output = model(X)
    _, predicted = torch.max(output, dim=0)
    if output[0].item() > -2:
      return 0
    return predicted.item()

In [45]:
t_ids = list(data_tweets['Tweet_ID'])
t_ids = random.sample(t_ids, 50)
temp_df = data_tweets[data_tweets['Tweet_ID'].isin(t_ids)]
test_tweets = list(temp_df['Tweet'])
test_labels = list(temp_df['Mobile_Tech_Tag'])
total_tweets = len(test_tweets)
correct = 0
for t in range(total_tweets):
  p = predict(test_tweets[t])
  if p == test_labels[t]:
    correct += 1
print(correct)

49
