# scip classification

##### Things to do: 

- Consider feature selection [http://scikit-learn.org/stable/modules/feature_selection.html](link here) and weighting

### setting up, loading data

##### choose stoplist_1 or stoplist_2 

In [1]:
import pandas as pd
import nltk

# Stoplists
stoplist = pd.read_csv("~/dropbox/research/scip/scientific_practices_josh/generality/stoplist_1.csv", header = None)
# stoplist = pd.read_csv("~/dropbox/research/scip/scientific_practices_josh/generality/stoplist_2.csv", header = None)

##### for new data - change to where file is saved in directory

In [2]:
# comment out if using original data
data = pd.read_csv("~/dropbox/research/scip/scientific_practices_josh/generality/new_data.csv")

##### for original data - change to where file is saved in directory

In [3]:
# comment out if using new data
# data = pd.read_csv("~/dropbox/research/scip/scientific_practices_josh/generality/old_data.csv")

### descriptives (dimensions, column names, and frequency of codes)

In [4]:
data.shape # dimensions

(174, 14)

In [5]:
data.columns.values # column names

array(['spec', 'gen', 'blank', 'both', 'gen_or_spec_code',
       'gen_or_spec_new', 'gen_or_spec', 'text', 'code', 'bin5', 'bin4',
       'bin3', 'bin2', 'bin2_test'], dtype=object)

##### for original data--can also choose which code / bins to use (change data.code assignment)

In [6]:
# Comment out if using new data
# data.code = data.bin2 # change to select other codes
# data.text = data.Why
# data.gen_or_spec = data.gen_gen_spec_spec

##### for new data --can also choose which code / bins to use (change data.code assignment)

In [23]:
# Comment out if using original data
data.gen_or_spec = data.gen_or_spec_code
data.code = data.code # 

In [24]:
data.code.value_counts() # frequency of codes

4    54
2    51
1    41
3    15
0    13
Name: code, dtype: int64

### preliminary processing

In [25]:
import re
from nltk.stem import *
stemmer = PorterStemmer()

proc_text = []

for i in data.index:
    tmp_text = re.sub("\d+", "", data.text[i]) # removes numbers  
    tmp_text = re.sub(r'[?|$|.|!]',r'', tmp_text) # removes puncutation
    tmp_text = tmp_text.lower() # makes text lowercase
    tmp_text = ([i for i in tmp_text.split() if i not in stoplist]) # tokenizes and removes stopwords
    tmp_text = ' '.join(tmp_text) # joins words back to stopwords
    tmp_text = stemmer.stem(tmp_text) # stems words
    proc_text.append(tmp_text)
    
data.proc_text = proc_text

### make term document matrix

In [37]:
# This 'tdm_df' function is more or less copied from the NLTK website
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np

def tdm_df(docs, xColNames = None, **kwargs):
    vectorizer = CountVectorizer(**kwargs)
    x1 = vectorizer.fit_transform(docs)
    df = pd.DataFrame(x1.toarray().transpose(), index = vectorizer.get_feature_names())
    if xColNames is not None:
        df.columns = xColNames
    return df

out_tdm = tdm_df(data.proc_text) # creates term document matrix
out_tdm.gen = data.gen # these add the dummy codes for gen or spec
out_tdm.spec = data.spec
out_tdm.blank = data.blank
out_tdm.both = data.both
out_tdm = out_tdm.transpose()

codes_list = data.code.tolist()
# codes_list = ['%.1f' % num for num in codes_list] # need this with new data

codes_array = np.array(codes_list)
out_array = np.array(out_tdm, dtype="|S6")
out_tdm

Unnamed: 0,about,accur,accurate,acedic,acedit,acet,acetate,acetic,acid,age,...,without,wont,work,works,would,wouldnt,yes,you,your,youre
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### classifying

In [38]:
from sklearn.cross_validation import KFold
from sklearn.naive_bayes import GaussianNB
from statistics import mean

gnb = GaussianNB()

from sklearn.cross_validation import KFold

X = out_array
y = codes_array

kf = KFold(174, n_folds = 174, shuffle = True)

res_out = []
test_list = [] # for true y codes
y_test_list = [] # for test y codes

for train_index, test_index in kf:
        
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train = X_train.astype(float) # features must be numeric
    X_test = X_test.astype(float)
    
    train_fit = gnb.fit(X_train, y_train)
    test = gnb.predict(X_test)
    # res = test == y_test
    test_list.extend(test)
    y_test_list.extend(y_test)
    # res = sum(res.tolist()) / len(res.tolist())
    # res_out.append(res)
    
test_list = pd.Series(test_list)
y_test_list = pd.Series(y_test_list)

#### agreement of k-folds validation with 10 folds:

In [41]:
int(sum(test_list == y_test_list) / len(y_test_list) * 100)
# test_list.astype(float)

63

#### cohen's kappa of k-folds validation with 10 folds:

In [42]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(test_list, y_test_list)

0.50303744673134465

In [30]:
#### confusion matrix:

In [43]:
test_list = pd.Series(test_list)
y_test_list = pd.Series(y_test_list)

x = pd.crosstab(test_list, y_test_list, rownames=['True'], colnames=['Predicted'], margins = True)
x

Predicted,0,1,2,3,4,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6,0,0,0,0,6
1,3,25,8,3,5,44
2,1,4,30,6,1,42
3,0,3,1,2,0,6
4,3,9,12,4,48,76
All,13,41,51,15,54,174
