In [5]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

all_data = pd.read_csv('data/train.csv')
data = all_data.dropna()

x_train, x_test, y_train, y_test = train_test_split(data[['text', 'title']], data['label'], test_size=0.2, random_state=38923)

In [21]:
tokenizer = TfidfVectorizer(stop_words='english')


vectors = []
x = [x_train['text'], x_train['title']]
for d in x:
    vectorizer = TfidfVectorizer(stop_words='english')
    vectors.append(vectorizer.fit_transform(d))
vector = hstack(vectors)

svd_model = TruncatedSVD(1000)
vector = svd_model.fit_transform(vector)

scl = StandardScaler()
vector = scl.fit_transform(vector)

In [22]:
np.mean(cross_val_score(LogisticRegression(penalty='l1', solver='liblinear'), vector, y_train))

0.971903488324541

In [25]:
np.mean(cross_val_score(LogisticRegression(penalty='l1', solver='liblinear', C=10), vector, y_train))

0.9688958526853264

In [26]:
np.mean(cross_val_score(LogisticRegression(penalty='l1', solver='liblinear', C=0.1), vector, y_train))

0.974364559417191

In [27]:
np.mean(cross_val_score(LogisticRegression(penalty='l1', solver='liblinear', C=0.01), vector, y_train))

0.9477717604033394

In [28]:
np.mean(cross_val_score(LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.75, n_jobs=-1), vector, y_train))

0.97251889630837

In [29]:
for c in [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.025, 0.01]:
    print(np.mean(cross_val_score(LogisticRegression(penalty='l1', solver='liblinear', C=c), vector, y_train)))

0.971903488324541
0.9721769692296007
0.9723820039609514
0.9723136278925752
0.9725870620607463
0.9728605897026948
0.9734759042127463
0.9742963702963703
0.974364559417191
0.9707411886359255
0.9638368648894964
0.9479084658032025


In [31]:
for c in [0.01, 0.05, 0.1, 0.2, 0.35]:
    for l1 in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        print(f'c={c}, l1={l1}, accuracy = ')
        print(np.mean(cross_val_score(LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=l1, C=c, n_jobs=-1), vector, y_train)))

c=0.01, l1=0.1, accuracy = 
0.9751846983425931
c=0.01, l1=0.2, accuracy = 
0.9731339070286438
c=0.01, l1=0.3, accuracy = 
0.9688270559849507
c=0.01, l1=0.4, accuracy = 
0.9665711364658733
c=0.01, l1=0.5, accuracy = 
0.9628796233006758
c=0.01, l1=0.6, accuracy = 
0.9596665556665556
c=0.01, l1=0.7, accuracy = 
0.956863814548025
c=0.01, l1=0.8, accuracy = 
0.9535825577930842
c=0.01, l1=0.9, accuracy = 
0.9511898160319212
c=0.05, l1=0.1, accuracy = 
0.9755950482266271
c=0.05, l1=0.2, accuracy = 
0.9759367182525077
c=0.05, l1=0.3, accuracy = 
0.9758684122894647
c=0.05, l1=0.4, accuracy = 
0.9753215205846786
c=0.05, l1=0.5, accuracy = 
0.9758683889210206
c=0.05, l1=0.6, accuracy = 
0.9746380169538064
c=0.05, l1=0.7, accuracy = 
0.9735442335442336
c=0.05, l1=0.8, accuracy = 
0.9725871087976351
c=0.05, l1=0.9, accuracy = 
0.9719035584298743
c=0.1, l1=0.1, accuracy = 
0.9741595246858404
c=0.1, l1=0.2, accuracy = 
0.9751849787639261
c=0.1, l1=0.3, accuracy = 
0.9749797804534648
c=0.1, l1=0.4, ac