In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


import other libraries

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
import re
from sklearn.pipeline import Pipeline

Read in the dataset

In [3]:
df_train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
df_test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
df_sub = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')

Data cleaning

In [4]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"\xa0"," ",phrase)
    return phrase.strip()

def clean_text(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    x = re.sub("'\d+", '',x)
    x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return decontracted(x)

In [5]:
df_train['full_text'] = df_train['full_text'].apply(clean_text)
df_test['full_text'] = df_test['full_text'].apply(clean_text)


In [6]:
df_train['full_text'][0]

'many people have car where they live. the thing they do not know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in vauban,germany they dont have that proble because percent of vauban is families do not own cars,and percent sold a car to move there. street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the french and swiss borders. you probaly will not see a car in vauban is streets because they are completely "car free" but if some that lives in vauban that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $, along with a home. the vauban people completed this in ,they said that this an example of a growing trend in europe,the untile states and some where else are suburba

feature extraction

create a word_count column

In [7]:
df_train['word_count'] = df_train["full_text"].str.split().str.len()
df_test['word_count'] = df_test["full_text"].str.split().str.len()

In [8]:
df_train.head()

Unnamed: 0,essay_id,full_text,score,word_count
0,000d118,many people have car where they live. the thin...,3,496
1,000fe60,i am a scientist at nasa that is discussing th...,3,336
2,001ab80,people always wish they had the same technolog...,4,553
3,001bdc0,"we all heard about venus, the planet without a...",4,450
4,002ba53,"dear, state senator this is a letter to argue ...",3,377


In [9]:
X_train = df_train['full_text']
y_train = df_train['score']
X_test = df_test['full_text']
# y_test = pd.DataFrame()
y_true = df_sub['score']

# X_train = tfidf_vectorizer.fit_transform(df_train['full_text'])
# y_train = df_train['score']
# X_test = tfidf_vectorizer.transform(df_test['full_text'])
# y_test = df_test['target_column']

create the tfidf matrix

In [10]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, max_df=0.95,)
tfidf = tfidf_vectorizer.fit_transform(df_train['full_text'])


create the multinomial logistic regression classifier

In [11]:
logreg_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

create a pipeline for logistic regression classifier

In [12]:
logreg_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('classifier', logreg_clf)
])

In [13]:
logreg_pipeline.fit(X_train, y_train)
y_pred = logreg_pipeline.predict(X_test)

In [14]:
y_pred

array([1, 3, 4])

In [15]:
kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
kappa


0.6666666666666667