In [21]:
# Feature based Employer Recommendation System 

In [1]:
%whos

Interactive namespace is empty.


In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, RobustScaler

## Adding clean dataset and categorizing dataset for Size, Revenue and Industry(Area) 

In [2]:
dfnewest=pd.read_csv('Oct3_EDA.csv')

In [3]:
#exploring dataset
dfnewest.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Company,Competitors,Founded,Headquarters,Industry,Now known as,Part of,Revenue,Size,Type,Area
0,0,0,J.P. Morgan,,1799,"New York, NY",Investment Banking & Asset Management,,,$10+ billion (USD) per year,10000+ employees,Company - Public (JPM),Finance
1,1,1,IBM,,1911,"Armonk, NY",IT Services,,,$10+ billion (USD) per year,10000+ employees,Company - Public (IBM),Technology
2,2,2,Citi,,1812,"New York, NY",Investment Banking & Asset Management,,,$10+ billion (USD) per year,10000+ employees,Company - Public (C),Finance
3,3,3,Macy's,,1858,"Cincinnati, OH","Department, Clothing, & Shoe Stores",,,$10+ billion (USD) per year,10000+ employees,Company - Public (M),Fashion/Retail
4,4,4,Verizon,,2000,"New York, NY",Telecommunications Services,,,$10+ billion (USD) per year,10000+ employees,Company - Public (VZ),Technology


In [4]:
dfnewest.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Company', 'Competitors', 'Founded',
       'Headquarters', 'Industry', 'Now known as', 'Part of', 'Revenue',
       'Size', 'Type', 'Area'],
      dtype='object')

In [5]:
# dropping unnecessary columns 
dfnewest = dfnewest.drop(['Unnamed: 0','Unnamed: 0.1','Now known as','Part of','Headquarters','Type','Industry','Competitors','Founded',], axis=1)

In [6]:
dfnewest.head()

Unnamed: 0,Company,Revenue,Size,Area
0,J.P. Morgan,$10+ billion (USD) per year,10000+ employees,Finance
1,IBM,$10+ billion (USD) per year,10000+ employees,Technology
2,Citi,$10+ billion (USD) per year,10000+ employees,Finance
3,Macy's,$10+ billion (USD) per year,10000+ employees,Fashion/Retail
4,Verizon,$10+ billion (USD) per year,10000+ employees,Technology


#### Categorizing Revenue feature into 4 ranges and making each range their own column

In [7]:
dfnewest['Revenue'] = dfnewest['Revenue'].map(lambda x: x.strip())

In [8]:
cleanup_nums = {"Revenue":     {"Less than $1 million (USD) per year": 1, "$1 to $5 million (USD) per year": 1,
                               "$5 to $10 million (USD) per year": 1, "$10 to $25 million (USD) per year": 1,
                                "$25 to $50 million (USD) per year": 1, "$50 to $100 million (USD) per year": 1,
                               "Unknown / Non-Applicable": 1, "$100 to $500 million (USD) per year": 2, 
                                "$500 million to $1 billion (USD) per year": 2, "$500 million to $1 billion (USD) per year": 2,
                               "$1 to $2 billion (USD) per year": 3, "$2 to $5 billion (USD) per year": 3, 
                                "$5 to $10 billion (USD) per year": 3, "$10+ billion (USD) per year": 4}}
                

In [9]:
dfnewest.replace(cleanup_nums, inplace=True)
dfnewest.head()

Unnamed: 0,Company,Revenue,Size,Area
0,J.P. Morgan,4,10000+ employees,Finance
1,IBM,4,10000+ employees,Technology
2,Citi,4,10000+ employees,Finance
3,Macy's,4,10000+ employees,Fashion/Retail
4,Verizon,4,10000+ employees,Technology


In [10]:
dfnewest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 4 columns):
Company    498 non-null object
Revenue    498 non-null object
Size       498 non-null object
Area       498 non-null object
dtypes: object(4)
memory usage: 15.6+ KB


In [11]:
dfnewest['Revenue'].value_counts()

3                                    153
4                                    144
1                                     97
2                                     94
Unknown / Non-Applicable per year     10
Name: Revenue, dtype: int64

In [12]:
dfgetdummies=pd.get_dummies(dfnewest, columns=['Revenue'])

In [15]:
dfgetdummies.head(25)

Unnamed: 0,Company,Size,Area,Revenue_1,Revenue_2,Revenue_3,Revenue_4,Revenue_Unknown / Non-Applicable per year
0,J.P. Morgan,10000+ employees,Finance,0,0,0,1,0
1,IBM,10000+ employees,Technology,0,0,0,1,0
2,Citi,10000+ employees,Finance,0,0,0,1,0
3,Macy's,10000+ employees,Fashion/Retail,0,0,0,1,0
4,Verizon,10000+ employees,Technology,0,0,0,1,0
5,Morgan Stanley,10000+ employees,Finance,0,0,0,1,0
6,Goldman Sachs,10000+ employees,Finance,0,0,0,1,0
7,PwC,10000+ employees,Finance,0,0,1,0,0
8,EY,10000+ employees,Finance,0,0,0,1,0
9,Deloitte,10000+ employees,Finance,0,0,0,1,0


In [16]:
dfgetdummies.columns

Index(['Company', 'Size', 'Area', 'Revenue_1', 'Revenue_2', 'Revenue_3',
       'Revenue_4', 'Revenue_Unknown / Non-Applicable per year'],
      dtype='object')

In [17]:
#changing column name 
dfchange = dfgetdummies.rename(columns={'Revenue_1': '$100M_or_Less', 'Revenue_2': '$100M_to_1B', 'Revenue_3': '$1B_to_10B','Revenue_4': 'More_than_$10B',})

In [18]:
dfchange.head()

Unnamed: 0,Company,Size,Area,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Revenue_Unknown / Non-Applicable per year
0,J.P. Morgan,10000+ employees,Finance,0,0,0,1,0
1,IBM,10000+ employees,Technology,0,0,0,1,0
2,Citi,10000+ employees,Finance,0,0,0,1,0
3,Macy's,10000+ employees,Fashion/Retail,0,0,0,1,0
4,Verizon,10000+ employees,Technology,0,0,0,1,0


#### Now do same thing to Size - (making into 4 ranges and then make into its own column)

In [19]:
dfchange['Size'] = dfchange['Size'].map(lambda x: x.strip())

In [20]:
# first SIze make into 4 categories 
cleanup_nums2 = {"Size":     {"1 to 50 employees": 1, "Unknown": 1, "51 to 200 employees": 1, 
                               "201 to 500 employees": 1, "501 to 1000 employees": 1,
                                "5001 to 10000 employees": 3, "1001 to 5000 employees": 2,
                               "10000+ employees": 4}}

In [21]:
dfchange.replace(cleanup_nums2, inplace=True)
dfchange.head()

Unnamed: 0,Company,Size,Area,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Revenue_Unknown / Non-Applicable per year
0,J.P. Morgan,4,Finance,0,0,0,1,0
1,IBM,4,Technology,0,0,0,1,0
2,Citi,4,Finance,0,0,0,1,0
3,Macy's,4,Fashion/Retail,0,0,0,1,0
4,Verizon,4,Technology,0,0,0,1,0


In [22]:
dfchange['Size'].value_counts()

4    266
2    113
3     64
1     55
Name: Size, dtype: int64

In [23]:
dfchange2=pd.get_dummies(dfchange, columns=['Size'])

In [24]:
dfchange2.head()

Unnamed: 0,Company,Area,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Revenue_Unknown / Non-Applicable per year,Size_1,Size_2,Size_3,Size_4
0,J.P. Morgan,Finance,0,0,0,1,0,0,0,0,1
1,IBM,Technology,0,0,0,1,0,0,0,0,1
2,Citi,Finance,0,0,0,1,0,0,0,0,1
3,Macy's,Fashion/Retail,0,0,0,1,0,0,0,0,1
4,Verizon,Technology,0,0,0,1,0,0,0,0,1


In [25]:
#renaming the size columns now 
dfchange3 = dfchange2.rename(columns={'Size_1': 'Employees_1to1000', 'Size_2': 'Employees_1000to5000', 'Size_3': 'Employees_5000to10000','Size_4': 'Employees_10000+',})

In [26]:
dfchange3.head()

Unnamed: 0,Company,Area,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Revenue_Unknown / Non-Applicable per year,Employees_1to1000,Employees_1000to5000,Employees_5000to10000,Employees_10000+
0,J.P. Morgan,Finance,0,0,0,1,0,0,0,0,1
1,IBM,Technology,0,0,0,1,0,0,0,0,1
2,Citi,Finance,0,0,0,1,0,0,0,0,1
3,Macy's,Fashion/Retail,0,0,0,1,0,0,0,0,1
4,Verizon,Technology,0,0,0,1,0,0,0,0,1


#### Now do for Area (Industry)

In [135]:
dfchange3.columns

Index(['Company', 'Area', '$100M_or_Less', '$100M_to_1B', '$1B_to_10B',
       'More_than_$10B', 'Employees_1to1000', 'Employees_1000to5000',
       'Employees_5000to10000', 'Employees_10000+'],
      dtype='object')

In [27]:
dfchange['Area'] = dfchange['Area'].map(lambda x: x.strip())

In [28]:
dfchange['Area'].value_counts()

Goods & Svcs       160
Finance            103
Technology          64
Fashion/Retail      47
Health              45
Education           32
Entertainment       28
Govnt/Utilities     19
Name: Area, dtype: int64

In [29]:
# make into 8 categories
cleanup_nums3 = {"Area":     {"Goods & Svcs": 1, "Finance": 2, "Technology": 3, 
                               "Fashion/Retail": 4, "Health": 5,
                                "Education": 6, "Entertainment": 7,
                               "Govnt/Utilities": 8}}

In [30]:
dfchange3.replace(cleanup_nums3, inplace=True)
dfchange3.head()

Unnamed: 0,Company,Area,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Revenue_Unknown / Non-Applicable per year,Employees_1to1000,Employees_1000to5000,Employees_5000to10000,Employees_10000+
0,J.P. Morgan,2,0,0,0,1,0,0,0,0,1
1,IBM,3,0,0,0,1,0,0,0,0,1
2,Citi,2,0,0,0,1,0,0,0,0,1
3,Macy's,4,0,0,0,1,0,0,0,0,1
4,Verizon,3,0,0,0,1,0,0,0,0,1


In [31]:
#make each category into a column
dfchange4=pd.get_dummies(dfchange3, columns=['Area'])

In [32]:
dfchange4.columns

Index(['Company', '$100M_or_Less', '$100M_to_1B', '$1B_to_10B',
       'More_than_$10B', 'Revenue_Unknown / Non-Applicable per year',
       'Employees_1to1000', 'Employees_1000to5000', 'Employees_5000to10000',
       'Employees_10000+', 'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5',
       'Area_6', 'Area_7', 'Area_8'],
      dtype='object')

In [33]:
cleanup_nums3 = {"Area":     {"Goods & Svcs": 1, "Finance": 2, "Technology": 3, 
                               "Fashion/Retail": 4, "Health": 5,
                                "Education": 6, "Entertainment": 7,
                               "Govnt/Utilities": 8}}

In [34]:
#rename each column
dfchange5 = dfchange4.rename(columns={'Area_1': 'Goods & Svcs', 'Area_2': 'Finance', 'Area_3': 'Technology','Area_4': 'Fashion/Retail','Area_5': 'Health','Area_6': 'Education','Area_7': 'Entertainment','Area_8':'Govnt/Utilities'})

In [35]:
dfchange5.head()

Unnamed: 0,Company,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Revenue_Unknown / Non-Applicable per year,Employees_1to1000,Employees_1000to5000,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,J.P. Morgan,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
1,IBM,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
2,Citi,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
3,Macy's,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,Verizon,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0


In [None]:
#exploring clean dataset 

In [36]:
len(dfchange5.columns)

18

In [37]:
dfchange5.set_index('Company', inplace=True)

In [38]:
dfchange5.reset_index()

Unnamed: 0,Company,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Revenue_Unknown / Non-Applicable per year,Employees_1to1000,Employees_1000to5000,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,J.P. Morgan,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
1,IBM,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
2,Citi,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
3,Macy's,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,Verizon,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
5,Morgan Stanley,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
6,Goldman Sachs,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
7,PwC,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
8,EY,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
9,Deloitte,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0


In [183]:
dfchange5.head()

Unnamed: 0_level_0,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,Employees_1to1000,Employees_1000to5000,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
J.P. Morgan,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
IBM,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
Citi,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
Macy's,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
Verizon,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0


In [151]:
# saving dataset that is properly categorized now 
dfchange5.to_csv('jobs_clean.csv')

In [167]:
dfchange5.columns

Index(['Company', '$100M_or_Less', '$100M_to_1B', '$1B_to_10B',
       'More_than_$10B', 'Employees_1to1000', 'Employees_1000to5000',
       'Employees_5000to10000', 'Employees_10000+', 'Goods & Svcs', 'Finance',
       'Technology', 'Fashion/Retail', 'Health', 'Education', 'Entertainment',
       'Govnt/Utilities'],
      dtype='object')

### More cleaning before modeling 

In [173]:
#final_dataframe.csv file has overall rating and subratings added onto the above categorized datset 

In [128]:
df6=pd.read_csv('final_dataframe.csv')

In [129]:
#exploring dataset
df6.head()

Unnamed: 0.1,Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,0,J.P. Morgan,3.9,3.8,3.5,3.3,3.8,3.7,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,IBM,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Citi,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Macy's,3.2,3.2,3.1,2.7,2.8,2.9,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,Verizon,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [41]:
df6.columns

Index(['Unnamed: 0', 'Company', 'Overall', 'Culture & Values',
       'Work/Life Balance', 'Sr. Management', 'Compensation & Benefits',
       'Career Opportunities', '$100M_or_Less', '$100M_to_1B', '$1B_to_10B',
       'More_than_$10B', 'Employees_1to1000', 'Employees_1000to5000',
       'Employees_5000to10000', 'Employees_10000+', 'Goods & Svcs', 'Finance',
       'Technology', 'Fashion/Retail', 'Health', 'Education', 'Entertainment',
       'Govnt/Utilities'],
      dtype='object')

In [130]:
#dropping unnecessary column
df6.drop(['Unnamed: 0'], axis=1, inplace=True)

In [131]:
df6.head()

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,J.P. Morgan,3.9,3.8,3.5,3.3,3.8,3.7,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,IBM,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Citi,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Macy's,3.2,3.2,3.1,2.7,2.8,2.9,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Verizon,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [44]:
#shouldnt have nulls...
df6.isnull().sum()

Company                    0
Overall                    2
Culture & Values           2
Work/Life Balance          2
Sr. Management             2
Compensation & Benefits    2
Career Opportunities       2
$100M_or_Less              4
$100M_to_1B                4
$1B_to_10B                 4
More_than_$10B             4
Employees_1to1000          4
Employees_1000to5000       4
Employees_5000to10000      4
Employees_10000+           4
Goods & Svcs               4
Finance                    4
Technology                 4
Fashion/Retail             4
Health                     4
Education                  4
Entertainment              4
Govnt/Utilities            4
dtype: int64

In [180]:
df6.head()

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,J.P. Morgan,3.9,3.8,3.5,3.3,3.8,3.7,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,IBM,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Citi,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Macy's,3.2,3.2,3.1,2.7,2.8,2.9,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Verizon,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [181]:
# need to create dataframe which doesnt have companies listed and instead a groupID so can put into model 
# df6 will serve as reference dataframe at the end.

In [182]:
# Adding GroupID 

In [45]:
# Adding GroupID 
df6["GroupId"] = df6.groupby(['Company']).grouper.label_info

In [46]:
df8=df6.set_index('GroupId')

In [47]:
df9=df8.drop(['Company'], axis=1)

In [48]:
df9.head()

Unnamed: 0_level_0,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
GroupId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
229,3.9,3.8,3.5,3.3,3.8,3.7,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
214,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
105,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
252,3.2,3.2,3.1,2.7,2.8,2.9,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
465,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [49]:
df6.head()

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities,GroupId
0,J.P. Morgan,3.9,3.8,3.5,3.3,3.8,3.7,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,229
1,IBM,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,214
2,Citi,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,105
3,Macy's,3.2,3.2,3.1,2.7,2.8,2.9,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,252
4,Verizon,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,465


In [50]:
#df10 is dataframe we put into model
df10=df9.reset_index()

In [51]:
df11=df9.reset_index()

In [52]:
df12=df9

In [53]:
df10.head()

Unnamed: 0,GroupId,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,229,3.9,3.8,3.5,3.3,3.8,3.7,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,105,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,252,3.2,3.2,3.1,2.7,2.8,2.9,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,465,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [54]:
df11=df10.drop(['GroupId'], axis=1)

In [55]:
df20=df10

In [201]:
# Cant use surprise library for determining similarity  and cant use sklearn for standardizing bc not doing 
#traintestsplit

#### Now we normalize and replace nulls 

In [61]:
df10.head()

Unnamed: 0,GroupId,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,229,3.9,3.8,3.5,3.3,3.8,3.7,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,105,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,252,3.2,3.2,3.1,2.7,2.8,2.9,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,465,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [62]:
df14=df10

In [70]:
#def preprocess2(df):
    
    normal = ['GroupId', 'Overall', 'Culture & Values', 'Work/Life Balance',
       'Sr. Management', 'Compensation & Benefits', 'Career Opportunities',
       '$100M_or_Less', '$100M_to_1B', '$1B_to_10B', 'More_than_$10B',
       'Employees_1to1000', 'Employees_1000to5000', 'Employees_5000to10000',
       'Employees_10000+', 'Goods & Svcs', 'Finance', 'Technology',
       'Fashion/Retail', 'Health', 'Education', 'Entertainment',
       'Govnt/Utilities']
    minmax = MinMaxScaler()
    df[normal] = minmax.fit_transform(df[normal])
    df.fillna(df.mean())
    return df

In [230]:
# preprocess2(df14)

Unnamed: 0,GroupId,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,0.457086,0.666667,0.645161,0.56,0.533333,0.666667,0.642857,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.427146,0.583333,0.612903,0.68,0.433333,0.407407,0.571429,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.209581,0.541667,0.548387,0.48,0.466667,0.592593,0.535714,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.502994,0.375000,0.451613,0.40,0.333333,0.296296,0.357143,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.928144,0.541667,0.516129,0.44,0.400000,0.814815,0.535714,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.576846,0.583333,0.645161,0.52,0.566667,0.555556,0.607143,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.373253,0.666667,0.645161,0.28,0.566667,0.703704,0.714286,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.706587,0.625000,0.612903,0.32,0.566667,0.444444,0.750000,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.289421,0.583333,0.612903,0.32,0.533333,0.444444,0.714286,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.261477,0.625000,0.645161,0.36,0.566667,0.518519,0.750000,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [224]:
df13.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502 entries, 0 to 501
Data columns (total 23 columns):
GroupId                    502 non-null float64
Overall                    502 non-null float64
Culture & Values           502 non-null float64
Work/Life Balance          502 non-null float64
Sr. Management             502 non-null float64
Compensation & Benefits    502 non-null float64
Career Opportunities       502 non-null float64
$100M_or_Less              502 non-null float64
$100M_to_1B                502 non-null float64
$1B_to_10B                 502 non-null float64
More_than_$10B             502 non-null float64
Employees_1to1000          502 non-null float64
Employees_1000to5000       502 non-null float64
Employees_5000to10000      502 non-null float64
Employees_10000+           502 non-null float64
Goods & Svcs               502 non-null float64
Finance                    502 non-null float64
Technology                 502 non-null float64
Fashion/Retail             502 

In [66]:
#Function to normalize and replace nulls
def preprocess(df):
    
    relevant = ['GroupId', 'Overall', 'Culture & Values', 'Work/Life Balance',
       'Sr. Management', 'Compensation & Benefits', 'Career Opportunities',
       '$100M_or_Less', '$100M_to_1B', '$1B_to_10B', 'More_than_$10B',
       'Employees_1to1000', 'Employees_1000to5000', 'Employees_5000to10000',
       'Employees_10000+', 'Goods & Svcs', 'Finance', 'Technology',
       'Fashion/Retail', 'Health', 'Education', 'Entertainment',
       'Govnt/Utilities']
    
#     outliers = ['published','avgrating','best','maxplayers','maxplaytime','minplaytime','not_recommended','playingtime','ratingscount','recommended']
    normal = ['GroupId', 'Overall', 'Culture & Values', 'Work/Life Balance',
       'Sr. Management', 'Compensation & Benefits', 'Career Opportunities',
       '$100M_or_Less', '$100M_to_1B', '$1B_to_10B', 'More_than_$10B',
       'Employees_1to1000', 'Employees_1000to5000', 'Employees_5000to10000',
       'Employees_10000+', 'Goods & Svcs', 'Finance', 'Technology',
       'Fashion/Retail', 'Health', 'Education', 'Entertainment',
       'Govnt/Utilities']
    
    df[relevant] = df[relevant].apply(lambda x: x.fillna(x.median()) if x.dtype != np.dtype('O') else x,axis=0)
    
    robust = RobustScaler()
    #df[outliers] = robust.fit_transform(df[outliers])
    minmax = MinMaxScaler()
    df[normal] = minmax.fit_transform(df[normal])
    
    return df

In [278]:
# preprocess(df10)

  return self.partial_fit(X, y)


Unnamed: 0,GroupId,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,0.457086,0.666667,0.645161,0.56,0.533333,0.666667,0.642857,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.427146,0.583333,0.612903,0.68,0.433333,0.407407,0.571429,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.209581,0.541667,0.548387,0.48,0.466667,0.592593,0.535714,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.502994,0.375000,0.451613,0.40,0.333333,0.296296,0.357143,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.928144,0.541667,0.516129,0.44,0.400000,0.814815,0.535714,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.576846,0.583333,0.645161,0.52,0.566667,0.555556,0.607143,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.373253,0.666667,0.645161,0.28,0.566667,0.703704,0.714286,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.706587,0.625000,0.612903,0.32,0.566667,0.444444,0.750000,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.289421,0.583333,0.612903,0.32,0.533333,0.444444,0.714286,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.261477,0.625000,0.645161,0.36,0.566667,0.518519,0.750000,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Can do modeling now to assess similarities across companies

In [None]:
# have to use unsupervised modeling approach to assess similarties across companies.Dont have target/label 
# in feature based

In [379]:
# nn = NearestNeighbors(metric='cosine')
# nn.fit(df20) 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [217]:
distances, indices = neigh.kneighbors(dfchange5)

In [218]:
indices

array([[186, 218,  33, 396, 201],
       [359, 493,  59, 211,  34],
       [186, 218,  33, 396, 201],
       ...,
       [125, 284,  31, 464,  39],
       [403, 334, 496, 299, 440],
       [232, 261, 188, 305, 497]])

In [220]:
neigh.kneighbors_graph(dfchange5).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

#### Create functions 

In [91]:
df6.columns

Index(['Company', 'Overall', 'Culture & Values', 'Work/Life Balance',
       'Sr. Management', 'Compensation & Benefits', 'Career Opportunities',
       '$100M_or_Less', '$100M_to_1B', '$1B_to_10B', 'More_than_$10B',
       'Employees_1to1000', 'Employees_1000to5000', 'Employees_5000to10000',
       'Employees_10000+', 'Goods & Svcs', 'Finance', 'Technology',
       'Fashion/Retail', 'Health', 'Education', 'Entertainment',
       'Govnt/Utilities', 'GroupId'],
      dtype='object')

In [65]:
def dropcols(df):
    to_drop = ['GroupId', 'Company']
    return df.drop(to_drop, axis=1)

In [None]:
#used dropcols within running model

Ended up not using this one 

In [100]:
nn = NearestNeighbors(n_neighbors=5)
nn.fit(dropcols(preprocess(df6)))

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [64]:
df20.head()

Unnamed: 0,GroupId,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,0.457086,0.666667,0.645161,0.56,0.533333,0.666667,0.642857,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.427146,0.583333,0.612903,0.68,0.433333,0.407407,0.571429,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.209581,0.541667,0.548387,0.48,0.466667,0.592593,0.535714,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.502994,0.375,0.451613,0.4,0.333333,0.296296,0.357143,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.928144,0.541667,0.516129,0.44,0.4,0.814815,0.535714,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [88]:
def get_test_array(df, ids):
    processed = preprocess(df)
    inputs = dropcols(processed[processed['GroupId'].isin(ids)])
    return inputs.mean().values.reshape(1, -1)

In [85]:
def get_nearest(df, ids, n=5):
    input_array = get_test_array(df, ids)
#    print(input_array.shape)
#    df = df.reset_index()
    nearest = nn.kneighbors(input_array, n)[-1]
    results = nearest.tolist()[0]
    return results

In [84]:
def find_company(df, ids):
    return df.iloc[ids, :]

In [84]:
get_test_array(df20, test)

array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan]])

This is how I normalized dataframe so all columns are on same scale
Not the 'preprocess' function created. 

In [71]:
normal = ['Overall', 'Culture & Values', 'Work/Life Balance',
       'Sr. Management', 'Compensation & Benefits', 'Career Opportunities',
       '$100M_or_Less', '$100M_to_1B', '$1B_to_10B', 'More_than_$10B',
       'Employees_1to1000', 'Employees_1000to5000', 'Employees_5000to10000',
       'Employees_10000+', 'Goods & Svcs', 'Finance', 'Technology',
       'Fashion/Retail', 'Health', 'Education', 'Entertainment',
       'Govnt/Utilities']
minmax = MinMaxScaler()
df11[normal] = minmax.fit_transform(df11[normal])
df11.replace([np.inf, -np.inf], np.nan)
df11.fillna(0)

Unnamed: 0,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,0.666667,0.645161,0.56,0.533333,0.666667,0.642857,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.583333,0.612903,0.68,0.433333,0.407407,0.571429,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.541667,0.548387,0.48,0.466667,0.592593,0.535714,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.375000,0.451613,0.40,0.333333,0.296296,0.357143,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.541667,0.516129,0.44,0.400000,0.814815,0.535714,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.583333,0.645161,0.52,0.566667,0.555556,0.607143,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.666667,0.645161,0.28,0.566667,0.703704,0.714286,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.625000,0.612903,0.32,0.566667,0.444444,0.750000,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.583333,0.612903,0.32,0.533333,0.444444,0.714286,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.625000,0.645161,0.36,0.566667,0.518519,0.750000,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### CHECKPOINT

In [None]:
# Lets clean up now 


In [72]:
df11.head()

Unnamed: 0,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,More_than_$10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
0,0.666667,0.645161,0.56,0.533333,0.666667,0.642857,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.583333,0.612903,0.68,0.433333,0.407407,0.571429,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.541667,0.548387,0.48,0.466667,0.592593,0.535714,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.375,0.451613,0.4,0.333333,0.296296,0.357143,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.541667,0.516129,0.44,0.4,0.814815,0.535714,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [74]:
df11=df11.replace([np.inf, -np.inf], np.nan)


In [75]:
df11=df11.fillna(0)

In [76]:
df11.isna().sum()

Overall                    0
Culture & Values           0
Work/Life Balance          0
Sr. Management             0
Compensation & Benefits    0
Career Opportunities       0
$100M_or_Less              0
$100M_to_1B                0
$1B_to_10B                 0
More_than_$10B             0
Employees_1to1000          0
Employees_1000to5000       0
Employees_5000to10000      0
Employees_10000+           0
Goods & Svcs               0
Finance                    0
Technology                 0
Fashion/Retail             0
Health                     0
Education                  0
Entertainment              0
Govnt/Utilities            0
dtype: int64

#### Testing NN algorithm now 

In [77]:
nn = NearestNeighbors(metric='cosine')
nn.fit(df11) 

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [78]:
test=[400,20,3]

In [79]:
dftest=df11.iloc[test,:]

In [80]:
dftest= dftest.mean().values.reshape(1, -1)

In [168]:
test.shape

(1, 1)

In [81]:
#getting nearest 
nearest = nn.kneighbors(dftest, n_neighbors=5)[-1]
results = nearest.tolist()[0]


In [218]:
print(results)

[137, 176, 20, 262, 205]


In [None]:
# GET RESULTS...FINALLY ! 

In [82]:
test=[400,17,2]

In [95]:
find_company(df6, test)

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities,GroupId
400,Fidelis Care,0.25,0.322581,0.36,0.266667,0.518519,0.357143,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335329
17,NYU (New York University),0.791667,0.709677,0.8,0.666667,0.666667,0.642857,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.592814
2,Citi,0.541667,0.548387,0.48,0.466667,0.592593,0.535714,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209581


In [90]:
resultt = [137, 176, 20, 262, 205]

In [92]:
testt=[400,20,3]

In [93]:
find_company(df6, testt)

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities,GroupId
400,Fidelis Care,0.25,0.322581,0.36,0.266667,0.518519,0.357143,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335329
20,Johnson & Johnson,0.708333,0.741935,0.68,0.566667,0.740741,0.607143,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.46507
3,Macy's,0.375,0.451613,0.4,0.333333,0.296296,0.357143,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.502994


In [91]:
find_company(df6, resultt)

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities,GroupId
137,Trader Joe's,0.791667,0.806452,0.68,0.7,0.814815,0.642857,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.886228
176,Delta Air Lines,0.791667,0.741935,0.64,0.666667,0.814815,0.75,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263473
20,Johnson & Johnson,0.708333,0.741935,0.68,0.566667,0.740741,0.607143,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.46507
262,Bayer,0.708333,0.677419,0.76,0.533333,0.740741,0.571429,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.123752
205,Stryker,0.666667,0.709677,0.6,0.6,0.703704,0.678571,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.824351


In [None]:
# LETS TRY FOR ANOTHER 

In [94]:
df6.head()

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities,GroupId
0,J.P. Morgan,0.666667,0.645161,0.56,0.533333,0.666667,0.642857,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.457086
1,IBM,0.583333,0.612903,0.68,0.433333,0.407407,0.571429,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.427146
2,Citi,0.541667,0.548387,0.48,0.466667,0.592593,0.535714,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209581
3,Macy's,0.375,0.451613,0.4,0.333333,0.296296,0.357143,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.502994
4,Verizon,0.541667,0.516129,0.44,0.4,0.814815,0.535714,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.928144


In [99]:
test2= [1,2,4]

In [100]:
dftest2=df11.iloc[test2,:]

In [101]:
dftest2= dftest2.mean().values.reshape(1, -1)

In [102]:
dftest2.shape

(1, 22)

#### Cosine Similarity Score 

In [None]:
nn = NearestNeighbors(metric='cosine')
nn.fit(df11) 

In [104]:
#getting nearest 
nearest = nn.kneighbors(dftest2, n_neighbors=5)[-1]
results22 = nearest.tolist()[0]

In [106]:
print(results22)

[147, 58, 34, 81, 181]


In [132]:
find_company(df6, test2)

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
1,IBM,3.7,3.7,3.8,3.0,3.1,3.5,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Citi,3.6,3.5,3.3,3.1,3.6,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Verizon,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [108]:
resultfor2=[147, 58, 34, 81, 181]

In [133]:
find_company(df6, resultfor2)

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
147,Honeywell,3.9,3.7,3.6,3.4,3.7,3.7,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
58,Amazon,3.8,3.8,3.4,3.4,3.9,3.8,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
34,Apple,3.9,4.0,3.3,3.4,4.1,3.5,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
81,T-Mobile,4.0,4.2,3.8,3.6,4.2,3.8,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
181,Microsoft,4.2,4.1,3.9,3.6,4.1,3.9,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### Now for Recommender with Minkowski with metric 

In [110]:
nn = NearestNeighbors(metric='minkowski')
nn.fit(df11) 

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [111]:
#getting nearest 
nearest = nn.kneighbors(dftest2, n_neighbors=5)[-1]
resultsMinkow = nearest.tolist()[0]

In [112]:
print(resultsMinkow)

[416, 192, 147, 4, 58]


In [113]:
resultsMinkowget= [416, 192, 147, 4, 58]

In [134]:
find_company(df6, resultsMinkowget)

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
416,Ericsson-Worldwide,3.5,3.5,3.5,2.8,3.7,3.1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
192,Capgemini,3.4,3.5,3.4,3.1,3.3,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
147,Honeywell,3.9,3.7,3.6,3.4,3.7,3.7,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Verizon,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
58,Amazon,3.8,3.8,3.4,3.4,3.9,3.8,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### Now Euclidean metric for recommender system

In [115]:
nn = NearestNeighbors(metric='euclidean')
nn.fit(df11) 

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [117]:
#getting nearest 
nearest = nn.kneighbors(dftest2, n_neighbors=5)[-1]
resultsEuc = nearest.tolist()[0]

In [118]:
print(resultsEuc)

[416, 192, 147, 4, 58]


In [119]:
resultsEucget=[416, 192, 147, 4, 58]

In [135]:
find_company(df6, resultsEucget)

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_5000to10000,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities
416,Ericsson-Worldwide,3.5,3.5,3.5,2.8,3.7,3.1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
192,Capgemini,3.4,3.5,3.4,3.1,3.3,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
147,Honeywell,3.9,3.7,3.6,3.4,3.7,3.7,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Verizon,3.6,3.4,3.2,2.9,4.2,3.4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
58,Amazon,3.8,3.8,3.4,3.4,3.9,3.8,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### THIS IS THE END , Get company names / nearest with 'find_company' 

In [None]:
# Improvements include getting more companies, adding more features. Top5 predicted companies arent that good. 

In [53]:
company_list = list(df6.Company.unique())
print('Here is the list of available companies: ', company_list)

Here is the list of available companies:  ['J.P. Morgan', 'IBM', 'Citi', "Macy's", 'Verizon', 'Morgan Stanley', 'Goldman Sachs', 'PwC', 'EY', 'Deloitte', 'Cognizant Technology Solutions', 'Bloomberg L.P.', 'Bank of America', 'American Express', 'Columbia University', 'AIG', 'New York City Department of Education', 'NYU (New York University)', 'KPMG', 'Viacom', 'Johnson & Johnson', 'Rutgers University', 'UBS', 'Target', 'Credit Suisse', 'Thomson Reuters', 'Starbucks', 'Barclays', 'AT&T', 'BNY Mellon', 'The Home Depot', 'Mount Sinai Health System', 'CVS Health', 'Deutsche Bank', 'Apple', 'Accenture', 'NBCUniversal', 'Northwell Health', 'NewYork-Presbyterian Hospital', 'Memorial Sloan Kettering', 'UPS', 'Best Buy', 'Prudential', 'NYU Langone Health', 'ADP', "Bloomingdale's", 'Pfizer', 'New York Life', 'Merck', 'Chase', 'Ralph Lauren', 'TD', 'Toys "R" Us', "McDonald's", 'Staples', 'MetLife', 'Shoprite Supermarkets', 'PepsiCo', 'Amazon', "L'Oréal", 'HSBC Holdings', 'Google', 'Cablevision Sy

In [86]:
get_nearest(df10, [0,1,2,3,4,5,6], n=10)

[424, 364, 341, 44, 304, 232, 15, 495, 189, 190]

In [89]:

# num_recs = input('How many recommendations would you like? Please enter a number from 1 to 10:\n')

# while int(num_recs) > 10:
#     num_recs = input('You entered a number over 10. Please enter a number from 1 to 10 to continue. \n')

# input_list = []
# num_of_inputs = input('How many companies to compare with? ')
# i = 0
# while i < int(num_of_inputs):
#     temp = input('Company #'+str(i+1)+': ')
#     if temp in company_list:
#         input_list.append(temp)
#         i +=1
#     else:
#         print('Does not match companies list, please type again.')

# print('\n'+'-----Company Recommendations-----'+'\n')

# input_nums_list = []
# for i in input_list:
#     input_nums_list.append(df6.index[df6['Company'] == i].tolist())
output = get_nearest(df10, [0,1,2,3,4,5,6], n=10) # , n=num_recs
# output = get_nearest(df10, input_nums_list, n=num_recs) # , n=num_recs
# get_recommendations(input_list, int(num_recs))

df6.iloc[output,:]

Unnamed: 0,Company,Overall,Culture & Values,Work/Life Balance,Sr. Management,Compensation & Benefits,Career Opportunities,$100M_or_Less,$100M_to_1B,$1B_to_10B,...,Employees_10000+,Goods & Svcs,Finance,Technology,Fashion/Retail,Health,Education,Entertainment,Govnt/Utilities,GroupId
424,7-Eleven,3.1,2.9,2.8,2.7,2.8,2.8,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
364,360i,3.0,3.2,3.1,2.6,3.3,2.9,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
341,5W Public Relations,3.9,3.7,3.5,3.7,3.8,3.9,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
44,ADP,3.7,3.8,3.7,3.4,3.7,3.6,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
304,A+E Networks,2.9,2.7,3.4,2.3,3.2,2.4,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
232,AECOM,3.2,3.1,3.2,2.7,3.2,3.1,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
15,AIG,3.1,2.9,3.5,2.5,3.5,2.8,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7
495,AHRC New York City,2.9,3.1,2.9,2.4,2.7,2.6,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6
189,AMC Entertainment,3.2,3.2,3.1,2.9,2.7,2.8,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9
190,AOL,3.7,3.8,3.9,3.1,3.8,3.1,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
