In [1]:
import pandas as pd 

In [2]:
import numpy

In [3]:
data = pd.read_excel('training_set_rel3.xls')
df = data[data['essay_set']==1]
df.shape

(1783, 28)

In [4]:
X = df['essay']
X.head()

0    Dear local newspaper, I think effects computer...
1    Dear @CAPS1 @CAPS2, I believe that using compu...
2    Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...
3    Dear Local Newspaper, @CAPS1 I have found that...
4    Dear @LOCATION1, I know having computers has a...
Name: essay, dtype: object

In [5]:
Y = df['domain1_score']
Y.head()

0     8.0
1     9.0
2     7.0
3    10.0
4     8.0
Name: domain1_score, dtype: float64

# Tokenize , stemming and remove punctuation 

In [6]:
import nltk

In [7]:
from nltk.stem.porter import PorterStemmer

In [8]:
import string

In [9]:
stemmer = PorterStemmer()
def stem_tokens(tokens,stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [10]:
def tokenized(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens,stemmer)
    return stems

# Create tfidf tabel

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_data = TfidfVectorizer(tokenizer=tokenized,stop_words='english').fit_transform(X)

In [13]:
tfidf_data

<1783x11832 sparse matrix of type '<class 'numpy.float64'>'
	with 181766 stored elements in Compressed Sparse Row format>

In [14]:
tfidf_data.todense()

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [15]:
tfidf_data

<1783x11832 sparse matrix of type '<class 'numpy.float64'>'
	with 181766 stored elements in Compressed Sparse Row format>

In [16]:
A = tfidf_data.todense()

In [17]:
A

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

# SVD create from tfidf matrix A

In [18]:
from numpy.linalg import svd

In [19]:

u,s,v = svd(A,full_matrices=False)

In [20]:
p = 0
for i in s:
    if (i>0.65):
        p = p+1
print(p)
        

906


In [21]:
reduce = len(s)-p
reduce

877

In [22]:
ua = u.shape[1]-reduce
ua

906

In [23]:
U = u[:,:ua]
U

matrix([[ -2.08734085e-02,  -6.48327201e-04,  -3.96563636e-05, ...,
          -3.68486856e-03,  -2.07761869e-02,   5.72591710e-02],
        [ -2.54860043e-02,  -2.41330647e-02,   2.94488323e-02, ...,
          -2.74557801e-02,  -2.85402672e-02,   3.54441535e-02],
        [ -3.24588714e-02,   2.24575414e-02,  -7.81003397e-03, ...,
          -9.70551746e-03,   2.54126687e-03,   3.26698008e-02],
        ..., 
        [ -3.35761915e-02,   2.86908225e-02,  -4.86646223e-02, ...,
           6.18265855e-03,   2.45435528e-02,  -4.12269500e-03],
        [ -1.26266785e-02,   2.88217898e-03,   1.15799422e-03, ...,
           1.22104780e-02,   3.36763871e-02,   1.62096135e-02],
        [ -2.04815527e-02,  -7.58849517e-05,   3.49124798e-03, ...,
           1.36747972e-03,  -2.43636409e-02,   3.69020864e-02]])

In [24]:
U.shape

(1783, 906)

In [25]:
va = v.shape[0]-reduce
va

906

In [26]:
V = v[:va]
V.shape

(906, 11832)

In [27]:
from numpy import diag

In [28]:
S = diag(s[:p])
S.shape

(906, 906)

In [29]:
S

array([[ 16.40592225,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   5.64130738,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   4.21247506, ...,   0.        ,
          0.        ,   0.        ],
       ..., 
       [  0.        ,   0.        ,   0.        , ...,   0.65137614,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.65130649,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.65032644]])

In [30]:
svd_matrix = U*S*V

In [31]:
svd_matrix.shape

(1783, 11832)

In [32]:
from scipy import sparse

In [33]:
full_svd = sparse.csc_matrix(svd_matrix)

In [34]:
full_svd

<1783x11832 sparse matrix of type '<class 'numpy.float64'>'
	with 21096456 stored elements in Compressed Sparse Column format>

# Trainig test split form full_svd and Y 

In [35]:
from sklearn.cross_validation import train_test_split



In [36]:
x1,x2,y1,y2 = train_test_split(full_svd,Y,random_state=0,test_size=0.1)

In [37]:
x1.shape,y1.shape,x2.shape,y2.shape

((1604, 11832), (1604,), (179, 11832), (179,))

# Support Vector Machine

In [38]:
from sklearn import svm

In [39]:
model = svm.SVC()

In [40]:
model.fit(x1,y1)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [41]:
model.score(x2,y2)

0.39106145251396646

In [42]:
from sklearn.model_selection import cross_val_score

In [43]:
score = cross_val_score(model,x2,y2,cv=5)



In [44]:
score

array([ 0.35897436,  0.37837838,  0.38888889,  0.4       ,  0.4375    ])

In [45]:
y3 = model.predict(x2)

In [46]:
for i in y3:
    print(i)

8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0


In [47]:
for i in y2:
    print(i)

7.0
8.0
10.0
7.0
10.0
10.0
12.0
10.0
7.0
10.0
9.0
9.0
10.0
7.0
8.0
7.0
8.0
8.0
8.0
8.0
6.0
9.0
6.0
10.0
11.0
10.0
8.0
8.0
10.0
9.0
8.0
11.0
9.0
8.0
10.0
9.0
8.0
9.0
8.0
6.0
10.0
8.0
8.0
8.0
10.0
9.0
10.0
8.0
6.0
8.0
10.0
8.0
10.0
8.0
8.0
9.0
8.0
8.0
8.0
8.0
11.0
7.0
8.0
8.0
9.0
8.0
8.0
9.0
7.0
10.0
9.0
7.0
8.0
10.0
9.0
11.0
9.0
8.0
8.0
10.0
6.0
8.0
10.0
8.0
10.0
8.0
5.0
8.0
10.0
10.0
10.0
9.0
12.0
11.0
8.0
2.0
10.0
7.0
8.0
10.0
7.0
8.0
6.0
8.0
9.0
8.0
10.0
6.0
10.0
10.0
8.0
8.0
9.0
12.0
11.0
4.0
10.0
11.0
8.0
9.0
8.0
8.0
9.0
8.0
11.0
10.0
8.0
9.0
9.0
7.0
9.0
9.0
8.0
10.0
10.0
6.0
8.0
8.0
8.0
8.0
10.0
8.0
8.0
8.0
10.0
10.0
9.0
8.0
8.0
9.0
11.0
9.0
7.0
9.0
10.0
8.0
9.0
8.0
8.0
10.0
9.0
8.0
8.0
8.0
8.0
8.0
8.0
10.0
8.0
9.0
7.0
8.0
10.0
12.0
8.0
2.0
11.0
8.0
7.0


In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [49]:
model1 = KNeighborsClassifier(n_neighbors=5)
model1.fit(x1,y1)
model1.score(x2,y2)

0.40782122905027934

In [50]:
model2 = LogisticRegression()
model2.fit(x1,y1)
model2.score(x2,y2)

0.44134078212290501

In [58]:
model = KNeighborsClassifier(n_neighbors=10)

In [59]:
from sklearn.cross_validation import cross_val_score


In [61]:
cross_val_score(model,full_svd,Y,cv=5)



array([ 0.37119114,  0.36768802,  0.34929577,  0.34366197,  0.35977337])