In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from tensorflow.keras.layers import Dense,Input,GlobalMaxPooling1D,LSTM,Embedding,Conv1D,Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
train_data=pd.read_csv('/kaggle/input/anyas-gojo-revival-space-shop/train.tsv', sep='\t')
train_data.head()

Unnamed: 0,date,time,id,reviewer name,reviewer address,contact number,variation,verified_reviews,sentiment,feedback
0,30-Jul-18,02:16:00,5138527,Theresa Diaz,"2373 Long Mews\nSouth Lorraine, FL 83715",7445448464,Black Plus,Love my new Echo Plus,5,1
1,30-Jul-18,20:13:24,7881409,Michael Wagner,"34120 James Well Apt. 930\nSouth Kenneth, AK 5...",(202)233-6662,Configuration: Fire TV Stick,Works really well.,5,1
2,29-Jul-18,03:20:48,5658396,Erin Rodriguez,"7996 Booth Mission\nTaylorport, AL 13457",(219)882-0705,Black Show,Echo Show is said to work with certain apps bu...,2,0
3,30-Jul-18,22:48:21,4317957,Christine Perez,USS Schaefer\nFPO AA 86432,291-900-6087,Heather Gray Fabric,Very cool product. Speaker sounds good with my...,5,1
4,23-Jul-18,12:36:42,7570569,Jamie Davila,"000 Kevin Cliffs Suite 674\nBrennanview, NM 77690",9527296610,Black Plus,So far I love it. It was easy to set up - if ...,5,1


In [3]:
train_data.isnull().sum()

date                0
time                0
id                  0
reviewer name       0
reviewer address    0
contact number      0
variation           0
verified_reviews    1
sentiment           0
feedback            0
dtype: int64

In [4]:
train_data.fillna({"verified_reviews":"no review"}, inplace=True)

In [5]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [6]:
from scipy.special import softmax
pov_review=[]
neg_review=[]
neu_review=[]
for i in train_data['verified_reviews']:
    try:
        encoded_text = tokenizer(i, return_tensors='pt')
        output = model(**encoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
#         print(scores)
        neg_review.append(scores[0])
        neu_review.append(scores[1])
        pov_review.append(scores[2])
    except RuntimeError:
        print(1)
        pov_review.append(1.0)
        neg_review.append(0.0)
        neu_review.append(0.0)

1
1


In [7]:
new_dataframe=pd.DataFrame({"positive":pov_review,"negative":neg_review,"neutral":neu_review,"feedback":train_data['feedback']})
new_dataframe.head()

Unnamed: 0,positive,negative,neutral,feedback
0,0.989508,0.001466,0.009026,1
1,0.936137,0.004737,0.059127,1
2,0.107922,0.570545,0.321533,0
3,0.988388,0.001345,0.010267,1
4,0.991431,0.002093,0.006476,1


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(new_dataframe, train_data['sentiment'], random_state = 0) 
dtree_model = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, y_train) 
dtree_predictions = dtree_model.predict(X_test) 

In [9]:
from sklearn.metrics import confusion_matrix 
accuracy = dtree_model.score(X_test, y_test)
print(accuracy)

0.8126984126984127


In [10]:
test_data=pd.read_csv('/kaggle/input/anyas-gojo-revival-space-shop/test.tsv', sep='\t')
pov_test_review=[]
neg_test_review=[]
neu_test_review=[]
for i in test_data['verified_reviews']:
    try:
        encoded_text = tokenizer(i, return_tensors='pt')
        output = model(**encoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
#         print(scores)
        neg_test_review.append(scores[0])
        neu_test_review.append(scores[1])
        pov_test_review.append(scores[2])
    except RuntimeError:
        print(1)
        pov_test_review.append(1.0)
        neg_test_review.append(0.0)
        neu_test_review.append(0.0)

In [11]:
test_dataframe=pd.DataFrame({"positive":pov_test_review,"negative":neg_test_review,"neutral":neu_test_review,"feedback":test_data['feedback']})
test_dataframe.head()

Unnamed: 0,positive,negative,neutral,feedback
0,0.984899,0.002404,0.012697,1
1,0.986175,0.003311,0.010514,1
2,0.005175,0.963042,0.031783,0
3,0.858581,0.032428,0.108991,1
4,0.963731,0.003297,0.032972,1


In [12]:
arr=dtree_model.predict(test_dataframe)
print(arr)

[5 5 1 5 5 5 5 5 1 3 5 5 5 4 5 5 1 5 3 4 5 5 5 5 5 3 1 5 5 1 5 5 5 5 5 5 5
 5 5 5 5 1 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 1 5 5 5 5 1 5 3 5 5 5 5 2 5 5 5 4 5 5 5 5 5 4 1 5 1 5 4 5 5 5 5
 5 2 5 5 5 5 5 5 5 5 4 5 5 3 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5
 5 3 5 5 4 5 5 5 5 4 5 2 5 5 3 4 5 5 5 5 5 5 5 5 5 5 4 5 5 2 5 5 5 5 5 2 5
 2 5 5 4 5 5 5 5 5 5 5 4 5 5 5 5 5 5 4 5 5 3 1 5 5 4 5 5 5 4 5 5 4 5 5 5 5
 5 5 5 5 5 5 1 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 3 5 5 3 3 1 1 5 4 5 5 5 2 5 5
 5 4 3 5 5 5 4 5 5 5 5 3 5 5 1 5 5 5 5 3 5 4 3 3 5 5 5 5 5 4 4 4 5 5 5 5 5
 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 1 3 4 5 5 5 5 5 1 5 5 5 5 5 1 5 5 5 4 4 4 5 5 4 5 5 5 4
 5 5 5 5 2 5 5 5 5 5 1 5 5 5 5 5 5 1 1 3 5 5 5 5 2 3 5 5 1 5 5 5 5 5 5 5 5
 5 5 5 5 4 5 4 5 5 5 5 2 5 3 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 2 3 5
 3 5 5 4 5 2 5 5 5 1 4 5 5 5 5 5 5 5 5 4 5 5 1 5 5 5 5 5 5 5 5 4 2 5 5 5 5
 5 5 5 3 5 5 5 5 4 1 4 5 

In [13]:
submission=pd.DataFrame({"id":test_data['id'],"sentiment":arr})
submission.to_csv("submission6.csv",index=False)