# text classification 

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression



In [2]:
file_paths_dict = {"amazon":"data/amazon_cells_labelled.txt",
                   "imdb":"data/imdb_labelled.txt",
                   "yelp":"data/yelp_labelled.txt"}

**preparing the data**

In [3]:
dflist = []
for source , path in file_paths_dict.items():
    df = pd.read_csv(path,names=["sentence","label"],sep="\t")
    df["source"] = source
    dflist.append(df)
df = pd.concat(dflist)
df

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
995,I think food should have flavor and texture an...,0,yelp
996,Appetite instantly gone.,0,yelp
997,Overall I was not impressed and would not go b...,0,yelp
998,"The whole experience was underwhelming, and I ...",0,yelp


# Amazon Data

**spliting the data**

In [4]:
df_amazon = df[df["source"]=="amazon"]

In [5]:
X=df_amazon["sentence"].values
y=df_amazon["label"].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer 

**data preprocessing**

In [8]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

CountVectorizer()

In [9]:
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

**build and train model**

In [10]:
model = LogisticRegression()
model.fit(X_train,y_train)
score = model.score(X_test,y_test)

print ("Accuracy of amazon data : ", score)

Accuracy of amazon data :  0.803030303030303


# imdb

In [11]:
df_imdb = df[df["source"]=="imdb"]

In [12]:
X=df_imdb["sentence"].values
y=df_imdb["label"].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [14]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

CountVectorizer()

In [15]:
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [16]:
model = LogisticRegression()
model.fit(X_train,y_train)
score = model.score(X_test,y_test)

print ("Accuracy of imdb data : ", score)

Accuracy of imdb data :  0.7246963562753036


# yelp 

In [17]:
df_yelp = df[df["source"]=="yelp"]

In [18]:
X=df_yelp["sentence"].values
y=df_yelp["label"].values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [20]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

CountVectorizer()

In [21]:
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [22]:
model = LogisticRegression()
model.fit(X_train,y_train)
score = model.score(X_test,y_test)

print ("Accuracy of yelp data : ", score)

Accuracy of yelp data :  0.796969696969697
