In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Root folder of the competition data; every input path below is built from it.
DATA_DIR = '/kaggle/input/feedback-prize-effectiveness'


def _read_essays(essay_ids, split):
    """Return a dict mapping each distinct essay id to the full text of its file.

    Parameters
    ----------
    essay_ids : pandas.Series
        Essay ids; duplicates are allowed and are read only once.
    split : str
        Sub-folder of ``DATA_DIR`` that holds the ``<essay_id>.txt`` files
        ('train' or 'test').

    Each file is opened in a ``with`` block so its handle is closed as soon as
    the text has been read, instead of being left for the garbage collector.
    """
    essays = {}
    for essay_id in essay_ids.unique():
        with open(os.path.join(DATA_DIR, split, f'{essay_id}.txt')) as essay_file:
            essays[essay_id] = essay_file.read()
    return essays


train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Several discourse rows share one essay_id, so read every essay once and
# broadcast its text onto all rows of that essay.
train_data['text'] = train_data['essay_id'].map(_read_essays(train_data['essay_id'], 'train'))
test_data['text'] = test_data['essay_id'].map(_read_essays(test_data['essay_id'], 'test'))
train_data.head(2)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,text
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."


In [3]:
# Integer class id for each effectiveness label.
data_label = dict(Ineffective=0, Adequate=1, Effective=2)

In [4]:
# Add the integer class id of each row's effectiveness label as `target`,
# then renumber the rows 0..n-1.
train_data = (
    train_data
    .assign(target=train_data["discourse_effectiveness"].map(data_label))
    .reset_index(drop=True)
)
train_data.head(1)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,text,target
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...",1


In [5]:
# 5-fold stratified splitter; shuffling with a fixed seed keeps the folds reproducible.
skfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [6]:
# Record, for every training row, the fold in which it is held out for evaluation.
# `split` yields positional row indices, so the ids are collected in a positional
# integer array and attached as a column in one step. This keeps `fold` an integer
# column (writing into a not-yet-existing column through `.loc` leaves it as float)
# and does not depend on the frame having a default 0..n-1 index.
fold_ids = np.full(len(train_data), -1, dtype=int)
for i, (_, held_out_rows) in enumerate(skfold.split(train_data, train_data["target"])):
    fold_ids[held_out_rows] = i
train_data["fold"] = fold_ids

In [7]:
# Sanity check: number of rows held out in each fold.
print(train_data["fold"].value_counts())

0.0    7353
1.0    7353
2.0    7353
4.0    7353
3.0    7353
Name: fold, dtype: int64


In [8]:
# Word 1- to 3-gram TF-IDF with L2-normalised rows; the same object is refit for
# each text column inside the cross-validation loop.
tf_idf = TfidfVectorizer(ngram_range=(1,3),norm='l2', smooth_idf=True)
# handle_unknown="ignore": a discourse type that was not present in the fitting
# rows is encoded as an all-zero row instead of raising at transform time.
one_hot_encode = OneHotEncoder(handle_unknown="ignore")
# Collects one array of test-set class probabilities per fold.
preds = []

In [9]:
def _fit_on_train(encoder, train_values, eval_values, test_values):
    """Fit ``encoder`` on the training split only, then transform all three splits.

    Returns a ``(train, eval, test)`` tuple of transformed matrices. All three
    transforms are done before returning, so the caller may safely refit the
    same encoder object on another column afterwards.
    """
    return (
        encoder.fit_transform(train_values),
        encoder.transform(eval_values),
        encoder.transform(test_values),
    )


def _as_column(series):
    """Reshape a 1-D pandas Series into an (n_samples, 1) array for OneHotEncoder."""
    return series.values.reshape(-1, 1)


# Start from an empty list so that re-running this cell does not stack a second
# set of per-fold test predictions on top of the first.
preds = []

for fold in range(skfold.get_n_splits()):
    # Rows whose fold id equals `fold` are held out for evaluation.
    train_data_ = train_data[train_data['fold'] != fold]
    eval_data_ = train_data[train_data['fold'] == fold]

    # TF-IDF of the discourse element itself.
    discourse_blocks = _fit_on_train(
        tf_idf,
        train_data_["discourse_text"],
        eval_data_["discourse_text"],
        test_data["discourse_text"],
    )
    # TF-IDF of the whole essay (same vectorizer object, refit on this column).
    text_blocks = _fit_on_train(
        tf_idf,
        train_data_["text"],
        eval_data_["text"],
        test_data["text"],
    )
    # One-hot encoding of the discourse type.
    type_blocks = _fit_on_train(
        one_hot_encode,
        _as_column(train_data_["discourse_type"]),
        _as_column(eval_data_["discourse_type"]),
        _as_column(test_data["discourse_type"]),
    )

    # Column order per split: one-hot type | discourse TF-IDF | essay TF-IDF.
    train_tfidf, eval_tfidf, test_tfidf = (
        sparse.hstack(
            (sparse.csr_matrix(type_block), discourse_block, text_block),
            format="csr",
        )
        for type_block, discourse_block, text_block in zip(type_blocks, discourse_blocks, text_blocks)
    )

    rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    rf_classifier.fit(train_tfidf, train_data_["target"].values)

    eval_preds = rf_classifier.predict_proba(eval_tfidf)
    # Pass the classifier's own class order so the probability columns are matched
    # to the right labels even if a class is absent from the held-out rows.
    eval_loss = log_loss(eval_data_["target"].values, eval_preds, labels=rf_classifier.classes_)
    print("Fold : {} EV score: {}".format(fold,eval_loss))

    preds.append(rf_classifier.predict_proba(test_tfidf))

Fold : 0 EV score: 2.912854014185138
Fold : 1 EV score: 2.730143671678094
Fold : 2 EV score: 3.0301836537658384
Fold : 3 EV score: 2.649121652420133
Fold : 4 EV score: 2.657398343403062


In [10]:
# Template for the submission file. The absolute path matches the other input
# paths in this notebook and does not depend on the current working directory.
submission_data = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/sample_submission.csv')

In [11]:
# Average the per-fold test probabilities element-wise (axis 0 indexes the folds).
final_result = np.mean(np.asarray(preds), axis=0)

In [12]:
# Copy each probability column of the averaged predictions into the submission
# column at the same position: 0 -> Ineffective, 1 -> Adequate, 2 -> Effective.
for class_position, column_name in enumerate(("Ineffective", "Adequate", "Effective")):
    submission_data.loc[:, column_name] = final_result[:, class_position]
submission_data.head(2)

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.02,0.52,0.46
1,5a88900e7dc1,0.02,0.5,0.48


In [13]:
# Write the submission file without the row index (`index` is a boolean flag).
submission_data.to_csv('submission.csv', index=False)