<a href="https://colab.research.google.com/github/kat-tian/personality_detection/blob/master/personality_XGboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing and XGboost for data 

1. Import dataset (essays.csv)
2. Load embeddings (Mount to Drive)
3. Function to clean data (strip, lower, non-ASCII)
4. Lemmatize 
5. Vectorize (look up the embedding vector of each word, then sum that vector's components: one number per word)
6. Set vectors to fixed-length (pad short, cut long)
7. Convert categorical labels (y/n to 1/0)






>[Preprocessing and XGboost for data](#scrollTo=Yoje3OcUegNP)

>>[Setup Installs](#scrollTo=BqCCKE4IgY47)

>>[Data Preprocessing](#scrollTo=fG-OpEPHwzD0)

>>>[Define Functions](#scrollTo=fG-OpEPHwzD0)

>>>[Prepare Train/Test](#scrollTo=LRsxLQeJ_BXd)

>>[Build XGBoost Model](#scrollTo=7boYySSukXJo)



## Setup Installs

In [0]:
#standard imports

import numpy as np
import pandas as pd
import re 
import string

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
stop_words = set(stopwords.words('english'))


import xgboost as xgb
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Mount Google Drive at /content/drive (Colab only); the embeddings file
# loaded in the next cell is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the pre-trained word2vec embeddings (binary format) from Drive.
# `emb_model` is indexed by word in get_vectors / dense_vectors below; a word
# that is not in the model raises KeyError there and is skipped.
from gensim.models import KeyedVectors
filename = '/content/drive/My Drive/trained_models/GoogleNews-vectors-negative300.bin.gz'
emb_model = KeyedVectors.load_word2vec_format(filename, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Data Preprocessing

### Define Functions
1. text_preprocess: tokenize, convert to lower case, remove punctuation, non-ASCII characters, numbers and stop words (save as 'CLEAN_TEXT')
2. lemmatize: using list returned from previous, lemmatize (update 'CLEAN_TEXT')
3. get_vectors: for each word in the CLEAN_TEXT list, look up its embedding vector and sum the vector's components into a single number (save as 'TEXT_VECTORS')
4. count_list: count length of lists for each value 
5. fixed_vector: pad short vect, cut long (save as 'FIXED_VECTORS')


Finally, encode the categorical data to 0/1 using a mapping. 

In [0]:
# Read the essays dataset from the working directory (relative path).
# The file is decoded as latin1. Later cells use its 'TEXT' column as model
# input and its 'cEXT' column as the label.
data = pd.read_csv('essays.csv', encoding='latin1')

In [0]:
#define all functions 

def text_preprocess(text_str):
  """Clean one raw essay string and return its remaining tokens.

  Steps, in order: trim and lower-case, drop HTML-like tags, drop
  non-ASCII characters, drop digits and punctuation, tokenize with
  word_tokenize, and discard tokens found in `stop_words`.

  Args:
    text_str: raw text of a single essay.

  Returns:
    list of str: the surviving tokens, in their original order.
  """
  cleaned = text_str.strip().lower()

  # markup first, then every character outside the 7-bit ASCII range
  for pattern in ('<[^<]+?>', r'[^\x00-\x7f]'):
    cleaned = re.sub(pattern, '', cleaned)

  # only ASCII is left here, so the digits are exactly 0-9 and can be
  # deleted in the same pass as the punctuation characters
  unwanted = str.maketrans("", "", string.digits + string.punctuation)
  cleaned = cleaned.translate(unwanted)

  return [token for token in word_tokenize(cleaned) if token not in stop_words]


def lemmatize(list):
  """Lemmatize every token with a WordNetLemmatizer.

  Each token is passed to `lemmatize` without a part-of-speech argument.

  Args:
    list: iterable of tokens (e.g. the output of text_preprocess).
      NOTE: the name shadows the built-in `list`; it is kept so existing
      callers are unaffected.

  Returns:
    list of str: one lemma per input token, in the same order.
  """
  to_lemma = WordNetLemmatizer().lemmatize
  lemmas = []
  for token in list:
    lemmas.append(to_lemma(token))
  return lemmas


def get_vectors(list):
  """Reduce each known word's embedding to a single number.

  For every word found in `emb_model`, all components of its embedding
  vector are summed into one scalar. Words that raise KeyError on lookup
  are skipped, so the result can be shorter than the input.

  Args:
    list: iterable of word tokens.

  Returns:
    list: one summed-embedding scalar per word that was found.
  """
  scalars = []
  for token in list:
    try:
      embedding = emb_model[token]
    except KeyError:
      # word is not in the embedding vocabulary: leave it out
      continue
    scalars.append(np.sum(embedding))
  return scalars


def count_list(list):
  """Return how many items `list` contains (its `len`)."""
  size = len(list)
  return size


# NOTE: always works on a copy, so the list stored in the source column
# (e.g. data['TEXT_VECTORS']) is never padded as a side effect.
def fixed_vector(vec_list, length=600):
  """Return a copy of `vec_list` that is exactly `length` items long.

  Longer inputs are truncated to their first `length` items; shorter
  inputs are right-padded with 0.0. The input sequence itself is never
  modified.

  Args:
    vec_list: sequence of numbers (anything that supports slicing).
    length: required output length; defaults to 600.

  Returns:
    list: a new list with exactly `length` items.

  Raises:
    ValueError: if `length` is negative.
  """
  if length < 0:
    raise ValueError("length must be non-negative")

  # slicing followed by list() builds a fresh list even when no
  # truncation is needed, which is what keeps the caller's data intact
  fixed = list(vec_list[:length])
  fixed.extend([0.0] * (length - len(fixed)))
  return fixed

def sum_vectors(vec_list):
  """Add a collection of vectors together, component by component.

  Args:
    vec_list: sequence of equal-length vectors (e.g. the output of
      dense_vectors).

  Returns:
    The result of np.sum over axis 0, i.e. one total per component.
  """
  elementwise_total = np.sum(vec_list, axis=0)
  return elementwise_total

def dense_vectors(list):
  """Look up the full embedding vector of every known word.

  Unlike get_vectors, the embedding is kept as-is (not summed). Words
  that raise KeyError on lookup in `emb_model` are skipped.

  Args:
    list: iterable of word tokens.

  Returns:
    list: one embedding vector per word that was found, in input order.
  """
  found = []
  for token in list:
    try:
      found.append(emb_model[token])
    except KeyError:
      # word is not in the embedding vocabulary: leave it out
      continue
  return found
    

In [0]:
#apply functions 
data['CLEAN_TEXT'] = data['TEXT'].apply(text_preprocess) #clean text
data['CLEAN_TEXT'] = data['CLEAN_TEXT'].apply(lemmatize) #lemmatize
data['DENSE_VECTORS'] = data['CLEAN_TEXT'].apply(dense_vectors)
data['TEXT_VECTORS'] = data['CLEAN_TEXT'].apply(get_vectors) #create vectors
data['VECTOR_COUNT'] = data['TEXT_VECTORS'].apply(count_list) #count len vectors
# Hand fixed_vector a copy of each list: padding must not leak back into
# TEXT_VECTORS, otherwise that column no longer matches VECTOR_COUNT.
data['FIXED_VECTORS'] = data['TEXT_VECTORS'].apply(lambda vec: fixed_vector(list(vec))) #vecs to fix length 
data['FIXED_VEC_COUNT'] = data['FIXED_VECTORS'].apply(count_list) #len of fix-vec list
data['SUM_ELEMENTWISE_VECTORS'] = data['DENSE_VECTORS'].apply(sum_vectors) #sum-vectors are the elementwise sum for dense vectors 


In [0]:
#encode categorical data to 0,1
cleanup_nums = {'y': 1.0, 'n': 0.0}

def _is_yes_no_column(column):
  """Return True when every non-null value of `column` is the string 'y' or 'n'."""
  present = column.dropna()
  if present.empty:
    return False
  return bool(present.map(lambda v: isinstance(v, str) and v in cleanup_nums).all())

# Encode only genuine y/n columns instead of running replace() over the whole
# frame: a frame-wide replace also compares the list/array-valued feature
# columns cell by cell and would overwrite a stray 'y'/'n' in any other column.
# Re-running the cell is a no-op because encoded columns no longer hold strings.
for col in [name for name in data.columns if _is_yes_no_column(data[name])]:
  data[col] = data[col].map(cleanup_nums)

  op = lambda x: operator.eq(x, b)


In [0]:
# Sanity check: fixed_vector pads/truncates to 600 items, so the only length
# that should appear here is 600.
data['FIXED_VEC_COUNT'].value_counts()

600    2467
Name: FIXED_VEC_COUNT, dtype: int64

In [0]:
# peek at two randomly drawn rows, then report how many essays there are
sample_rows = data.sample(2)
print(sample_rows)
print('len:', len(data['TEXT']))

              #AUTHID  ...                            SUM_ELEMENTWISE_VECTORS
2416     2004_390.txt  ...  [11.978897, 14.949148, 6.8908997, 76.28322, -3...
1804  2002_819526.txt  ...  [17.69645, 13.626469, 7.5613594, 38.349854, -2...

[2 rows x 14 columns]
len: 2467


In [0]:
# show the first two fixed-length vectors
data['FIXED_VECTORS'].iloc[:2]

0    [-0.59524536, -0.46329117, 7.6182823, -3.18063...
1    [-0.59524536, 2.0501537, 0.39393616, 0.7791672...
Name: FIXED_VECTORS, dtype: object

### Prepare Train/Test
Prepare the input features and labels, split them into train and test sets, and check the resulting shapes.

In [0]:
#features and labels
# Stack the per-essay summed embeddings into one 2-D array in a single call.
# This replaces the deprecated np.matrix, the dummy all-zero first row and the
# row-by-row np.append (which copied the whole array on every iteration).
# float64 is kept because that is the dtype the append-based version produced.
features = np.vstack(data['SUM_ELEMENTWISE_VECTORS'].tolist()).astype(np.float64)
assert features.shape[0] == len(data), "expected one feature row per essay"

labels_ext = data['cEXT'].values


In [0]:
#train test/split using sklearn 
# 30% of the essays are held out; the fixed random_state makes the
# partition repeatable across runs
seed = 22
test_size = 0.30
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels_ext,
    random_state=seed,
    test_size=test_size,
)

In [0]:
#check shapes
# rows of features and labels must agree within each split
for label, split_part in (
    ('y_test shape:', y_test),
    ('x_test shape:', x_test),
    ('x_train shape:', x_train),
    ('y_train shape:', y_train),
):
  print(label, split_part.shape)

y_test shape: (741,)
x_test shape: (741, 300)
x_train shape: (1726, 300)
y_train shape: (1726,)


## Build XGBoost Model 

In [0]:
# fit model on training data
# XGBClassifier() is constructed with no arguments, i.e. library defaults only.
model = XGBClassifier()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
# predict a label for every held-out essay with the fitted model
y_pred = model.predict(x_test)

In [0]:
# evaluate predictions
# fraction of held-out essays whose predicted label matches the true one
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 52.77%
