In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train_data = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/train.csv')
test_data = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/test.csv')


def _essay_texts(essay_ids, split):
    """Return a Series aligned with ``essay_ids`` holding each essay's full text.

    Parameters
    ----------
    essay_ids : pandas.Series
        The ``essay_id`` column of a discourse-level frame (ids may repeat).
    split : str
        Sub-directory holding the ``<essay_id>.txt`` files ('train' or 'test').

    Each distinct essay file is opened exactly once inside a ``with`` block, so
    the handle is always closed and an essay referenced by several rows is not
    read from disk repeatedly.
    """
    texts = {}
    for essay_id in essay_ids.unique():
        with open(f'/kaggle/input/feedback-prize-effectiveness/{split}/{essay_id}.txt') as essay_file:
            texts[essay_id] = essay_file.read()
    return essay_ids.map(texts)


# Attach the whole essay to every discourse row as extra context.
train_data['text'] = _essay_texts(train_data['essay_id'], 'train')
test_data['text'] = _essay_texts(test_data['essay_id'], 'test')
train_data.head(2)

In [None]:
# Encode the target. LabelEncoder assigns integer codes in sorted label order;
# the fitted encoder is kept under its own name so the codes can be mapped back
# to label names later via `effectiveness_encoder.classes_`.
effectiveness_encoder = LabelEncoder()
train_data['discourse_effectiveness'] = effectiveness_encoder.fit_transform(train_data['discourse_effectiveness'])

# Encode discourse_type. The encoder is fitted on the training data ONLY and
# then applied to the test data, so a given type gets the same integer code in
# both frames. Re-fitting on the test frame would renumber the codes whenever
# the test set does not contain every type seen in training.
type_encoder = LabelEncoder()
train_data['discourse_type'] = type_encoder.fit_transform(train_data['discourse_type'])
test_data['discourse_type'] = type_encoder.transform(test_data['discourse_type'])

# Keep the original variable name bound for any code that still refers to it.
encoder = type_encoder

In [None]:
# 5-fold stratified split on the target so every fold keeps the class balance.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() yields POSITIONAL row indices, so the fold ids are collected in a
# positional array and attached as a whole column. This stays correct even if
# train_data's index is not the default 0..n-1 range, and gives an integer
# column rather than a float one.
fold_ids = np.full(len(train_data), -1, dtype=int)
for i, (_, holdout_positions) in enumerate(skfold.split(train_data, train_data["discourse_effectiveness"])):
    fold_ids[holdout_positions] = i

assert (fold_ids >= 0).all(), "every row must be assigned to exactly one fold"
train_data["fold"] = fold_ids

In [None]:
# Accumulates one matrix of test-set class probabilities per fold.
preds = list()

# Encoder applied to the integer-coded discourse_type column.
one_hot_encode = OneHotEncoder()

# Word uni-gram + bi-gram TF-IDF with L2-normalised rows.
tf_idf = TfidfVectorizer(
    smooth_idf=True,
    norm='l2',
    ngram_range=(1, 2),
)

In [None]:
def _fit_then_transform(transformer, column, fit_frame, other_frames, as_column_vector=False):
    """Fit ``transformer`` on one column of ``fit_frame`` and apply it to other frames.

    Parameters
    ----------
    transformer : object exposing ``fit_transform`` and ``transform``.
    column : str
        Name of the column to featurize.
    fit_frame : pandas.DataFrame
        Frame whose column the transformer is fitted on.
    other_frames : iterable of pandas.DataFrame
        Frames that are transformed with the already-fitted transformer.
    as_column_vector : bool
        When True the column values are passed reshaped to ``(-1, 1)``
        (the shape the one-hot encoder is fed).

    Returns
    -------
    list of scipy.sparse.csr_matrix
        ``[features(fit_frame), features(other_frames[0]), ...]``.
    """
    def _values(frame):
        if as_column_vector:
            return frame[column].values.reshape(-1, 1)
        return frame[column]

    matrices = [transformer.fit_transform(_values(fit_frame))]
    matrices.extend(transformer.transform(_values(frame)) for frame in other_frames)
    return [sparse.csr_matrix(matrix) for matrix in matrices]


# Start from an empty list so re-running this cell does not stack a second
# set of fold predictions on top of the previous run's.
preds = []

# Take the fold count from the splitter instead of repeating the literal 5.
for fold in range(skfold.get_n_splits()):
    train_data_ = train_data[train_data['fold'] != fold]
    eval_data_ = train_data[train_data['fold'] == fold]
    held_out = (eval_data_, test_data)

    # All transformers are fitted on the training part of the fold only.
    # tf_idf is shared by both text columns and is refitted by each call, so a
    # column's three matrices are produced before the next column is fitted.
    one_hot_parts = _fit_then_transform(one_hot_encode, "discourse_type", train_data_, held_out,
                                        as_column_vector=True)
    discourse_parts = _fit_then_transform(tf_idf, "discourse_text", train_data_, held_out)
    essay_parts = _fit_then_transform(tf_idf, "text", train_data_, held_out)

    # Column layout of every feature matrix: [one-hot type | discourse TF-IDF | essay TF-IDF].
    train_tfidf, eval_tfidf, test_tfidf = (
        sparse.hstack(parts) for parts in zip(one_hot_parts, discourse_parts, essay_parts)
    )

    # NOTE(review): both `random_state` and `seed` are passed with different
    # values; which one takes effect depends on the xgboost version - confirm
    # and keep only one. Left unchanged here so results are not altered.
    clf = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7)
    clf.fit(train_tfidf, train_data_["discourse_effectiveness"].values)

    # Out-of-fold multi-class log loss for this fold.
    eval_preds = clf.predict_proba(eval_tfidf)
    eval_loss = log_loss(eval_data_["discourse_effectiveness"].values, eval_preds)
    print("Fold : {} EV score: {}".format(fold, eval_loss))

    # Keep this fold's test-set probabilities for later averaging.
    preds.append(clf.predict_proba(test_tfidf))

In [None]:
# Template for the submission file. Uses the same absolute input directory as
# the train/test loading code, so the read does not depend on the kernel's
# current working directory.
submission_data = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/sample_submission.csv')

In [None]:
# Average the per-fold test probabilities -> one (n_test_rows, n_classes) array.
final_result = np.mean(preds, axis=0)

# The target was integer-coded with LabelEncoder, which numbers classes in
# SORTED label order, and predict_proba's columns follow those integer codes.
# So column 0 is "Adequate", column 1 "Effective" and column 2 "Ineffective";
# look each column up by label instead of assuming the submission's order.
effectiveness_labels = ("Ineffective", "Adequate", "Effective")
proba_column = {label: position for position, label in enumerate(sorted(effectiveness_labels))}
assert final_result.shape[1] == len(effectiveness_labels), "unexpected number of predicted classes"

for label in effectiveness_labels:
    submission_data.loc[:, label] = final_result[:, proba_column[label]]

submission_data.to_csv('submission.csv', index=False)