In [1]:
import numpy as np
import pandas as pd 

#sklearn processing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory.
# The unused sub-directory list is bound to `_subdirs` rather than `_`:
# assigning `_` at notebook top level shadows IPython's "last output"
# variable for the rest of the session.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv
/kaggle/input/feedback-prize-english-language-learning/train.csv
/kaggle/input/feedback-prize-english-language-learning/test.csv


In [2]:
# Directory holding the competition files; defined once so the location is
# not repeated for every file that is read.
INPUT_DIR = '/kaggle/input/feedback-prize-english-language-learning'

# Read the three competition CSV files into DataFrames.
sample_submission_raw = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')
train_data_raw = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_data = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [3]:

# Step 1: hold out 20% of the labelled rows as a local test split.
train_and_val, test = train_test_split(train_data_raw, test_size=0.2, random_state=21)
# Step 2: take 10% of the remaining rows for validation. That is 10% of the
# 80% left after step 1 (about 8% of all labelled rows), not 10% of the total.
train, val = train_test_split(train_and_val, test_size=0.1, random_state=21)
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
3047,D9AAEE4C9BCC,Do you think that influencing other is imporan...,3.5,3.5,2.5,3.0,3.0,3.0
2848,CEF302996231,"Dear Dr. Generic_Name,\n\nThank you for taking...",3.0,3.0,3.0,2.5,3.0,3.0
1138,584A4BB1E954,Should these summer projects be teacher-design...,2.5,3.0,3.0,2.5,3.0,3.0
2824,CDDCA13DB734,The determination make us to do something and ...,3.5,3.0,3.0,2.5,2.5,3.5
1298,63AB55DADABF,Do you think we should extend our school day? ...,4.0,4.0,4.0,3.5,4.5,3.5
...,...,...,...,...,...,...,...,...
3654,F58DF8739B42,Dear Mr. Principal today I want to talk abot h...,3.0,3.0,3.5,2.5,3.0,2.0
3034,D92BFF2AC7FD,Do you think is a good idea to change the scho...,3.5,3.5,3.0,3.0,3.0,3.0
2978,D5C7212A31FD,20/11/2010\n\nMr. reader`s how are you?\n\nThi...,2.5,2.5,2.0,2.5,2.0,2.0
1887,8E5E6211E4D6,Have you ever wondered how is it like being in...,3.0,3.0,3.5,3.5,3.5,3.0


In [4]:
def predict(df, reference=None):
    """Fill the six score columns of ``df`` with a constant median baseline.

    Every row receives the same value per score column: the median of that
    column in ``reference``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the predictions. It is modified in place (the
        score columns are added or overwritten), so pass a copy when the
        original values must be kept.
    reference : pandas.DataFrame, optional
        Frame whose column medians are used as the prediction. When omitted,
        the notebook-level ``train`` split is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if reference is None:
        # Fall back to the notebook-level training split.
        reference = train

    score_columns = (
        'cohesion', 'syntax', 'vocabulary',
        'phraseology', 'grammar', 'conventions',
    )
    for column in score_columns:
        # A scalar assignment broadcasts the median to every row.
        df[column] = reference[column].median()
    return df

# Run the baseline on a copy of each split so the original frames keep
# their true score columns for later comparison.
train_predictions, val_predictions, test_predictions = (
    predict(split.copy()) for split in (train, val, test)
)

In [5]:
def calc_rmse(actual_df, predicted_df):
    """Score predictions against true values over the six score columns.

    For each score column the squared difference between the two frames is
    averaged over rows, giving a per-column MSE; its square root is the
    per-column RMSE. Both lists are then averaged across columns and rounded
    to two decimals.

    Parameters
    ----------
    actual_df : pandas.DataFrame
        Frame holding the true scores.
    predicted_df : pandas.DataFrame
        Frame holding the predicted scores in the same columns.

    Returns
    -------
    tuple
        ``(mse, rmse)`` - note the order: the averaged MSE comes first,
        the averaged RMSE second.
    """
    score_columns = ('cohesion', 'syntax', 'vocabulary',
                     'phraseology', 'grammar', 'conventions')

    # Mean squared error per column, computed once and reused for the RMSE.
    column_mse = [
        ((actual_df[name] - predicted_df[name]) ** 2).mean()
        for name in score_columns
    ]
    column_rmse = [np.sqrt(value) for value in column_mse]

    mse = np.mean(column_mse).round(2)
    rmse = np.mean(column_rmse).round(2)
    return mse, rmse

# Score each split against its baseline predictions. calc_rmse returns
# (mse, rmse), so every printed tuple reads (MSE, RMSE).
split_pairs = [
    (train, train_predictions),
    (val, val_predictions),
    (test, test_predictions),
]
train_scores, val_scores, test_scores = [
    calc_rmse(actual, predicted) for actual, predicted in split_pairs
]

print(f'Train: {train_scores}')
print(f'Val: {val_scores}')
print(f'Test: {test_scores}')

Train: (0.44, 0.66)
Val: (0.49, 0.7)
Test: (0.44, 0.66)


In [6]:
def baseline_prediction(parameter):
    """Return the median of column ``parameter`` in the notebook-level ``train`` frame."""
    training_values = train[parameter]
    return training_values.median()

# Write the constant baseline into each score column of the competition
# test frame (test_data is modified in place), then display it.
for score_column in ('cohesion', 'syntax', 'vocabulary',
                     'phraseology', 'grammar', 'conventions'):
    test_data[score_column] = baseline_prediction(score_column)
test_data

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,when a person has no experience on a job their...,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,Do you think students would benefit from being...,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",3.0,3.0,3.0,3.0,3.0,3.0


In [7]:
# Drop the full_text column and save the remaining columns as the
# submission file, without the DataFrame index.
res = test_data.drop(columns=["full_text"])
res.to_csv("/kaggle/working/submission.csv", index=False)