In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../data/19-02-23.csv', index_col=0)

In [3]:
df.shape

(1196, 2)

In [4]:
cvec = CountVectorizer(strip_accents='unicode')

In [10]:
df = df.dropna()

In [11]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['text'], df['lang'])

In [12]:
X_train = cvec.fit_transform(X_train_raw)
X_test = cvec.transform(X_test_raw)

In [13]:
lr = LogisticRegression()

In [14]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
lr.score(X_test, y_test)

0.82758620689655171

In [18]:
df2 = pd.DataFrame({'feature':cvec.get_feature_names(), 'coef':lr.coef_[0]})

In [19]:
df2.sort_values('coef', ascending=True).head()

Unnamed: 0,feature,coef
1787,building,-1.197434
6640,over,-0.943134
5493,legislative,-0.9152
1073,an,-0.797812
6405,notes,-0.796856


In [29]:
test = pd.DataFrame({
    'text':X_test_raw,
    'lang':y_test
}).reset_index(drop=True)

In [72]:
rando = np.random.randint(0, test.shape[0])
print(test.loc[rando, 'text'])

In 2009, James Cameron's "Avatar" once again showed the appeal of sci-fi epics, with more than $700 million and more than $2.7 billion in box office across the US and worldwide, making it the most successful box office movie (no Consider the inflation factor). In recent years, many sci-fi epics have achieved varying degrees of success, including Peter Jackson's "The Lord of the Rings" movie trilogy, the Harry Potter series.


In [73]:
print("Computer predicts:")
print(lr.predict(cvec.transform([test.loc[rando, 'text']]))[0])

Computer predicts:
zh


In [74]:
print("Actually:")
print(test.loc[rando, 'lang'])

Actually:
zh


In [76]:
test['pred'] = lr.predict(cvec.transform(test['text']))

In [78]:
test[test['lang']!=test['pred']]

Unnamed: 0,text,lang,pred
3,Most of the space opera stories revolve around...,zh,en
6,There are a number of municipal expressways an...,en,zh
10,The Canadian Space Agency operates a highly ac...,en,zh
18,"During the American Revolutionary War, colonis...",zh,en
20,"Around 1000 AD, Europeans set foot in Canada f...",zh,en
23,The Americas are usually accepted as having be...,en,zh
27,"The plot focuses on the Rebel Alliance, led by...",zh,en
38,North America is a continent entirely within t...,en,zh
46,"In cities, money, services, wealth and opportu...",en,zh
60,Star Wars proved to be the most successful fil...,zh,en
