In [1]:
import pandas as pd
from sklearn.utils import shuffle


In [2]:
# Load the labelled training set.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of hm_train.csv before running.
df = pd.read_csv("C:/Users/hrith/Downloads/Compressed/dataset/hm_train.csv")
# Shuffle with a fixed seed so that every run sees the rows in the same order;
# anything downstream that depends on row order (label ids assigned by first
# appearance, cross-validation folds) is then reproducible. A fresh 0..n-1
# index is assigned without mutating in place.
df = shuffle(df, random_state=0).reset_index(drop=True)

In [4]:
from io import StringIO
# Keep only the text and its label, and drop rows whose text is missing.
# .loc + .copy() yields an independent frame, so adding a column below writes
# to this frame rather than to a slice of the original (which would raise
# SettingWithCopyWarning).
col = ['cleaned_hm', 'predicted_category']
df = df.loc[df['cleaned_hm'].notnull(), col].copy()
# Integer-encode the labels; factorize() numbers categories in order of first
# appearance in the frame.
df['category_id'] = df['predicted_category'].factorize()[0]
# One row per category, ordered by id, used to build lookups in both directions.
category_id_df = df[['predicted_category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'predicted_category']].values)
df.head()

Unnamed: 0,cleaned_hm,predicted_category,category_id
0,Yesterday evening i played carrom board and ch...,affection,0
1,The University of North Carolina won the natio...,enjoy_the_moment,1
2,In the past three months one event that made m...,affection,0
3,One event that made me happy was riding roller...,enjoy_the_moment,1
4,I watched Moana with my family.,affection,0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams: sublinear term frequency, terms must occur
# in at least 5 documents, English stop words removed, rows L2-normalised.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Keep the result as the sparse matrix returned by fit_transform. Converting a
# (n_documents, n_terms) matrix of this size to a dense float64 array costs
# several gigabytes, and chi2, cross_val_score, train_test_split and the linear
# models all accept sparse input directly.
features = tfidf.fit_transform(df.cleaned_hm)
labels = df.category_id
features.shape

(60321, 18342)

In [10]:
from sklearn.feature_selection import chi2
import numpy as np
# Number of top-scoring unigrams / bigrams to print per category.
N = 2
# The vocabulary does not change between categories, so build the name array
# once instead of on every loop iteration. Newer scikit-learn releases expose
# get_feature_names_out() and have removed get_feature_names(); fall back to
# the old name when the new one is unavailable.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.asarray(_get_feature_names())
for predicted_category, category_id in sorted(category_to_id.items()):
    # One-vs-rest chi-squared test: this category against all the others.
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort by chi2 statistic, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(predicted_category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'achievement':
  . Most correlated unigrams:
. friend
. job
  . Most correlated bigrams:
. new job
. bought new
# 'affection':
  . Most correlated unigrams:
. daughter
. family
  . Most correlated bigrams:
. came home
. year old
# 'bonding':
  . Most correlated unigrams:
. friends
. friend
  . Most correlated bigrams:
. old friend
. best friend
# 'enjoy_the_moment':
  . Most correlated unigrams:
. pizza
. ate
  . Most correlated bigrams:
. ate favorite
. ate delicious
# 'exercise':
  . Most correlated unigrams:
. workout
. gym
  . Most correlated bigrams:
. went yoga
. went gym
# 'leisure':
  . Most correlated unigrams:
. movie
. watched
  . Most correlated bigrams:
. went movie
. went temple
# 'nature':
  . Most correlated unigrams:
. rain
. weather
  . Most correlated bigrams:
. sun shining
. weather nice


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Split the raw texts and their string labels with a fixed seed (no test_size
# is given, so train_test_split's default proportion applies).
X_train, X_test, y_train, y_test = train_test_split(
    df.cleaned_hm,
    df.predicted_category,
    random_state=0,
)

# Learn the vocabulary on the training texts and turn them into term counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Re-weight the raw counts into TF-IDF scores.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# NOTE: no classifier is fitted in this cell; a MultinomialNB fit on
# (X_train_tfidf, y_train) was previously tried here and is disabled.

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score


# Candidate classifiers to compare under k-fold cross-validation.
models = [
    LinearSVC(),
    LogisticRegression(random_state=0),
]
# Number of cross-validation folds.
CV = 5
# Collect one (model_name, fold_idx, accuracy) record per model and fold, then
# build the frame once at the end. (The frame is no longer pre-allocated: that
# empty frame was discarded as soon as the real one was created.)
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])



In [14]:
# Mean accuracy across folds for each model.
cv_df.groupby('model_name')['accuracy'].mean()

model_name
LinearSVC             0.894150
LogisticRegression    0.880456
Name: accuracy, dtype: float64

In [15]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split with a fixed seed. df.index is split alongside the
# features and labels so that each train/test row can be traced back to df.
(
    X_train, X_test,
    y_train, y_test,
    indices_train, indices_test,
) = train_test_split(features, labels, df.index, test_size=0.3, random_state=0)

# Fit on the training part and predict the held-out part.
model = LinearSVC()
y_pred = model.fit(X_train, y_train).predict(X_test)

In [16]:
# Refit the classifier on the complete feature matrix and label vector (not
# just a training split). The fitted estimator is the cell's displayed value.
model.fit(features, labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [17]:
# Load the unlabelled test set.
# NOTE(review): this is an absolute, machine-specific path.
df1 = pd.read_csv("C:/Users/hrith/Downloads/Compressed/dataset/hm_test.csv")
# Rows cannot be dropped here because every test row needs a prediction, so a
# missing text is replaced by an empty string; a NaN would otherwise make the
# vectorizer fail.
texts = df1['cleaned_hm'].fillna('')
# Re-use the vectorizer fitted on the training data (transform only, no refit).
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)
# Map predicted integer ids back to category names, in row order.
list_ = [id_to_category[predicted] for predicted in predictions]

In [18]:
# Attach the predicted category names to the test frame as a 'results' column.
df1 = df1.assign(results=list_)

In [19]:
# Submission frame: the row identifier plus the predicted category name.
new_df = pd.DataFrame({
    'hmid': df1['hmid'],
    'predicted_category': df1['results'],
})
# Write the submission without the index column. to_csv returns None when it
# is given a path, so its result is not bound to a variable.
# NOTE(review): this is an absolute, machine-specific output path.
new_df.to_csv(r'C:\Users\hrith\OneDrive\Desktop\results_final_submission.csv', index=False, header=True)


In [21]:
from sklearn import metrics
# Per-class precision / recall / F1 on the held-out split. The label ids and
# their display names are both taken from category_id_df (sorted by id), so
# each row of the report is tied to the right category explicitly rather than
# relying on the order in which categories happen to appear in df, and a class
# missing from the held-out split cannot shift the names.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=category_id_df['category_id'].tolist(),
    target_names=category_id_df['predicted_category'].tolist(),
))

                  precision    recall  f1-score   support

       affection       0.94      0.94      0.94      6289
enjoy_the_moment       0.77      0.72      0.74      1940
     achievement       0.88      0.92      0.90      6092
         bonding       0.95      0.94      0.94      1991
        exercise       0.89      0.85      0.87       222
          nature       0.82      0.79      0.80       349
         leisure       0.81      0.77      0.79      1214

       micro avg       0.89      0.89      0.89     18097
       macro avg       0.87      0.85      0.86     18097
    weighted avg       0.89      0.89      0.89     18097

