In [None]:
!pip install transformers

**BERT CLASSIFICATION - REDDIT TOPICS ( 5 )**

In [None]:
# BERT CLASSIFICATION REPORT 5
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

data = pd.read_pickle('irdata_50k_classifyvalues.pkl')
texts = data['message'].tolist()
labels = data['subreddit'].tolist()

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

input_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']
labels = torch.tensor(labels)

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)  # 5 labels

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
model.train()
for epoch in range(1):
    for batch in dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = tuple(t.to(device) for t in batch)

        outputs = model(input_ids_batch, attention_mask=attention_masks_batch, labels=labels_batch)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

model.eval()

In [None]:
def predict_label(text):
    encoded_input = tokenizer.encode_plus(text, add_special_tokens=True, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return predicted_label

In [None]:
x = {"Education" : 0, "Environment" : 1, "Healthcare" : 2, 'Politics' : 3, 'Technology' : 4}

In [None]:
new_text = "The quality of schools are getting better in recent times"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  The quality of schools are getting better in recent times
Context : Education


In [None]:
new_text = "Soil erosion is a serious issue"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  Soil erosion is a serious issue
Context : Environment


In [None]:
new_text = "Surgeries are very expensive"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  Surgeries are very expensive
Context : Healthcare


In [None]:
new_text = "Elections are next month"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  Elections are next month
Context : Politics


In [None]:
new_text = "Natural language processing is the latest trend"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  Natural language processing is the latest trend
Context : Technology


**BERT CLASSIFICATION - EMPATHIC CONTEXT ( 32 )**

In [None]:
# BERT CLASSIFICATION - EMPATHIC
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

data = pd.read_pickle('empathicdatafullclassify.pkl')
texts = data['message'].tolist()
labels = data['context'].tolist()

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

input_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']
labels = torch.tensor(labels)

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=32)  # 32 labels

optimizer = AdamW(model.parameters(), lr=2e-5)

dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
model.train()
for epoch in range(1):
    for batch in dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = tuple(t.to(device) for t in batch)

        outputs = model(input_ids_batch, attention_mask=attention_masks_batch, labels=labels_batch)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

model.eval()

In [None]:
def predict_label(text):
    encoded_input = tokenizer.encode_plus(text, add_special_tokens=True, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return predicted_label


In [None]:
x = {'Afraid': 0,
 'Angry': 1,
 'Annoyed': 2,
 'Anticipating': 3,
 'Anxious': 4,
 'Apprehensive': 5,
 'Ashamed': 6,
 'Caring': 7,
 'Confident': 8,
 'Content': 9,
 'Devastated': 10,
 'Disappointed': 11,
 'Disgusted': 12,
 'Embarrassed': 13,
 'Excited': 14,
 'Faithful': 15,
 'Furious': 16,
 'Grateful': 17,
 'Guilty': 18,
 'Hopeful': 19,
 'Impressed': 20,
 'Jealous': 21,
 'Joyful': 22,
 'Lonely': 23,
 'Nostalgic': 24,
 'Prepared': 25,
 'Proud': 26,
 'Sad': 27,
 'Sentimental': 28,
 'Surprised': 29,
 'Terrified': 30,
 'Trusting': 31}

In [None]:
new_text = "I am excited for tomorrow's match"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I am excited for tomorrow's match
Context : Anticipating


In [None]:
new_text = "I am ready to face any challenges"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I am ready to face any challenges
Context : Prepared


In [None]:
new_text = "I made a very big mistake"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I made a very big mistake
Context : Guilty


In [None]:
new_text = "I dont think I'll pass the exam"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I dont think I'll pass the exam
Context : Apprehensive


In [None]:
new_text = "Those days were amazing"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  Those days were amazing
Context : Nostalgic


In [None]:
new_text = "I am content about my success"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I am content about my success
Context : Content


In [None]:
new_text = "A bear is chasing me"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  A bear is chasing me
Context : Terrified


In [None]:
new_text = "I feel very very angry"
predicted_label = predict_label(new_text)
print("Input Text : ",new_text)
print("Context :", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I feel very very angry
Context : Furious


**BERT CLASSIFICATION - TOPIC ( 3 )**

In [None]:
# BERT CLASSIFICATION - TOPIC
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

data = pd.read_pickle('classifydatavalues.pkl')
texts = data['message'].tolist()
labels = data['topic'].tolist()

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

input_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']
labels = torch.tensor(labels)

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 labels

optimizer = AdamW(model.parameters(), lr=2e-5)

dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
model.train()
for epoch in range(1):
    for batch in dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = tuple(t.to(device) for t in batch)

        outputs = model(input_ids_batch, attention_mask=attention_masks_batch, labels=labels_batch)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

model.eval()

In [None]:
def predict_label(text):
    encoded_input = tokenizer.encode_plus(text, add_special_tokens=True, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return predicted_label


In [None]:
x = {"ChitChat" : 0, "Empathic" : 1, "Reddit" : 2}

In [None]:
new_text = "Soil erosion is the cause of global warming"
predicted_label = predict_label(new_text)
print("Input Text : ", new_text)
print("Topic : ", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  Soil erosion is the cause of global warming
Topic :  Reddit


In [None]:
new_text = "I have a exam tomorrow"
predicted_label = predict_label(new_text)
print("Input Text : ", new_text)
print("Topic : ", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I have a exam tomorrow
Topic :  ChitChat


In [None]:
new_text = "I am looking forward to tomorrow's match"
predicted_label = predict_label(new_text)
print("Input Text : ", new_text)
print("Topic : ", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I am looking forward to tomorrow's match
Topic :  Empathic


In [None]:
new_text = "How is the weather today"
predicted_label = predict_label(new_text)
print("Input Text : ", new_text)
print("Topic : ", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  How is the weather today
Topic :  ChitChat


In [None]:
new_text = "I made a very big mistake"
predicted_label = predict_label(new_text)
print("Input Text : ", new_text)
print("Topic : ", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  I made a very big mistake
Topic :  Empathic


In [None]:
new_text = "Surgeries are very expensive in US"
predicted_label = predict_label(new_text)
print("Input Text : ", new_text)
print("Topic : ", list(x.keys())[list(x.values()).index(predicted_label)])

Input Text :  Surgeries are very expensive in US
Topic :  Reddit


**NAIVE BAYES CLASSIFICATION -  REDDIT ( 5 )**

In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

with open('irdata_50k.pkl', 'rb') as f:
    data = pickle.load(f)

df = pd.DataFrame(data, columns=['prompt', 'subreddit'])

X_train, X_test, y_train, y_test = train_test_split(df['prompt'], df['subreddit'], test_size=0.2)

tfidf = TfidfVectorizer(max_features=1000)

X_train_tfidf = tfidf.fit_transform(X_train)

clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

X_test_tfidf = tfidf.transform(X_test)

y_pred = clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   education       0.77      0.84      0.80      1990
 environment       0.68      0.67      0.68      1991
  healthcare       0.82      0.82      0.82      2036
    politics       0.95      0.65      0.77      2041
  technology       0.57      0.72      0.64      1942

    accuracy                           0.74     10000
   macro avg       0.76      0.74      0.74     10000
weighted avg       0.76      0.74      0.74     10000



**NAIVE BAYES CLASSIFICATION - EMPATHIC CONTEXT ( 32 )**



In [2]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

with open('empathicdatafull.pkl', 'rb') as f:
    data = pickle.load(f)

df = pd.DataFrame(data, columns=['prompt', 'context'])

X_train, X_test, y_train, y_test = train_test_split(df['prompt'], df['context'], test_size=0.2)

tfidf = TfidfVectorizer(max_features=1000)

X_train_tfidf = tfidf.fit_transform(X_train)

clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

X_test_tfidf = tfidf.transform(X_test)

y_pred = clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      afraid       0.50      0.43      0.46       656
       angry       0.41      0.33      0.37       721
     annoyed       0.44      0.53      0.48       731
anticipating       0.47      0.42      0.44       611
     anxious       0.47      0.42      0.44       639
apprehensive       0.54      0.36      0.43       501
     ashamed       0.57      0.27      0.37       544
      caring       0.53      0.53      0.53       559
   confident       0.52      0.54      0.53       647
     content       0.67      0.66      0.66       628
  devastated       0.54      0.33      0.41       568
disappointed       0.46      0.41      0.43       658
   disgusted       0.53      0.60      0.56       645
 embarrassed       0.56      0.67      0.61       587
     excited       0.46      0.53      0.49       815
    faithful       0.63      0.52      0.57       410
     furious       0.55      0.39      0.46       633
    grateful       0.62    

**NAIVE BAYES CLASSIFICATION - TOPIC ( 3 )**

In [4]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

with open('classifydata.pkl', 'rb') as f:
    data = pickle.load(f)

df = pd.DataFrame(data, columns=['message', 'topic'])

X_train, X_test, y_train, y_test = train_test_split(df['message'], df['topic'], test_size=0.2)

tfidf = TfidfVectorizer(max_features=1000)

X_train_tfidf = tfidf.fit_transform(X_train)

clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

X_test_tfidf = tfidf.transform(X_test)

y_pred = clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    chitchat       0.65      0.75      0.70      3953
    empathic       0.74      0.76      0.75      4064
      reddit       0.86      0.72      0.78      4079

    accuracy                           0.74     12096
   macro avg       0.75      0.74      0.74     12096
weighted avg       0.75      0.74      0.74     12096

