# Testing whether various fields can be predicted by the text


In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.multiclass import OneVsOneClassifier
from sklearn import cross_validation, grid_search
from sklearn.pipeline import Pipeline
from sklearn import linear_model, ensemble, svm
from sklearn.svm import LinearSVC

import nltk
from nltk.corpus import stopwords, words

import matplotlib
matplotlib.style.use('ggplot')

In [3]:
## READ IN FILE, CLEAN UP EXTRANEOUS CHARACTERS
def strfy(x):
    return x.fillna(".").map(str)


df_all = pd.read_csv('corporate_culture_trunc.csv', encoding='latin-1') #reads csv into dataframe
df_all = df_all.dropna(subset=['Q1']) #drop row if no description (NaN in Q1)
dfs = df_all.Q1
dfs = [dfs1.encode('utf-8') for dfs1 in dfs]
df_all.Q1 = dfs
df_all['merge_text'] = df_all.Q1.map(str) + strfy(df_all.Q6_Other) + df_all.Q13_5_Other.fillna(".").map(str)


Var. 118	Fmt: N2	Col: 16850-16851	Name: firm_location (DN)
	11b. Where is your firm located?
	1=US/Canada	2=Africa	3=Asia	4=Europe	5=Latin America	

In [28]:
country1=3 #Asia
country2=4 #Europe
col_name="firm_location"

df_all = df_all.dropna(subset=[col_name]) #drop row if no description (NaN in firm_location)

df_hl = df_all.loc[(df_all[col_name] == country1) | (df_all[col_name] == country2)]
print(float(len(df_hl[df_hl[col_name]==country1]))/float(len(df_hl)))

x_train, x_test, y_train, y_test = cross_validation.train_test_split(df_hl['merge_text'],df_hl[col_name],test_size=0.2)

mypipeline=Pipeline([
  ('cvect', CountVectorizer(ngram_range=(1, 2), stop_words=nltk.corpus.stopwords.words('english'))),
  ('tfidf', TfidfTransformer()),
  ('rclassify', svm.SVC(gamma=2))
])

mypipeline.fit(x_train,y_train)
print(mypipeline.score(x_test,y_test))


0.435897435897
0.625


There does not seem to be a significant difference in the description of culture given in [Q1] between different countries.

Var. 114	Fmt: N2	Col: 16841-16842	Name: Ownership (DJ)
	Ownership
	1=Public	2=Private	3=Government or non-profit	


In [23]:
type1=3
type2=2
col_name="Ownership"

df_all = df_all.dropna(subset=[col_name]) #drop row if no description (NaN in firm_location)

df_hl = df_all.loc[(df_all[col_name] == type1) | (df_all[col_name] == type2)]
print(float(len(df_hl[df_hl[col_name]== type1]))/float(len(df_hl)))

x_train, x_test, y_train, y_test = cross_validation.train_test_split(df_hl['merge_text'],df_hl[col_name],test_size=0.2)

mypipeline=Pipeline([
  ('cvect', CountVectorizer(ngram_range=(1, 2), stop_words=nltk.corpus.stopwords.words('english'))),
  ('tfidf', TfidfTransformer()),
  ('rclassify', ensemble.RandomForestClassifier())
])

mypipeline.fit(x_train,y_train)
print(mypipeline.score(x_test,y_test))


0.161706349206
0.841584158416


In [None]:
No significant difference re: ownership, either