# Load Data

In [3]:
# Code to read csv file into Colaboratory:
# Install (or upgrade, -U) PyDrive quietly (-q); the next cell imports pydrive
# to build the Google Drive client used to fetch the CSV.
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
!pip install -U -q PyDrive
# Optional pandas pin, currently disabled.
# !pip install pandas==0.24.1

[?25l[K    1% |▎                               | 10kB 18.3MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.7MB/s eta 0:00:01[K    3% |█                               | 30kB 2.5MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.7MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.0MB/s eta 0:00:01[K    6% |██                              | 61kB 2.4MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.8MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.1MB/s eta 0:00:01[K    9% |███                             | 92kB 3.5MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.7MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.7MB/s eta 0:00:01[K    12% |████                            | 122kB 3.9MB/s eta 0:00:01[K    13% |████▎                           | 133kB 3.9MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.2MB/s eta 0:00:01[

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Step 1: run the Colab authentication call for the current user.
auth.authenticate_user()
# Step 2: attach the application-default credentials to a PyDrive GoogleAuth object.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Step 3: `drive` is the client used later in the notebook to download the profiles CSV.
drive = GoogleDrive(gauth)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 100
from sklearn.model_selection import train_test_split


link = 'https://drive.google.com/open?id=1G9OJtX53sYWipSayEojSWiE9DF3_mrJ7'
fluff, id = link.split('=')
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('profiles.csv')  
profiles = pd.read_csv('profiles.csv')


In [0]:
def age(i):
    """Bucket a numeric age into a decade label.

    Parameters
    ----------
    i : int or float
        Age in years.

    Returns
    -------
    str or float
        'under_20' for ages below 20, '20s' through '50s' for the matching
        decade, and '60s' for every age of 60 or more. A missing value (NaN)
        is returned unchanged.
    """
    # NaN is the only value that differs from itself. Every comparison with it
    # is False, so without this guard a missing age landed in the last bucket.
    if i != i:
        return i
    # Ages below 20 used to fall through to the catch-all branch and were
    # labelled '60s'; they now get a bucket of their own.
    if i < 20:
        return 'under_20'
    if i < 30:
        return '20s'
    if i < 40:
        return '30s'
    if i < 50:
        return '40s'
    if i < 60:
        return '50s'
    return '60s'
        
def height(i):
    """Bucket a numeric height into a coarse label.

    Parameters
    ----------
    i : int or float
        Height value. The thresholds 48 / 60 / 72 presumably stand for
        4 ft / 5 ft / 6 ft expressed in inches -- confirm the unit against
        the dataset.

    Returns
    -------
    str or float
        '4ft_less' (i <= 48), '5ft_less' (i <= 60), '6ft_less' (i <= 72) or
        'more_6ft' (anything larger). A missing value (NaN) is returned
        unchanged.
    """
    # NaN fails every `<=` test, so it used to be labelled 'more_6ft'.
    # Keep it missing instead of inventing a height bucket for it.
    if i != i:
        return i
    if i <= 48:
        return '4ft_less'
    if i <= 60:
        return '5ft_less'
    if i <= 72:
        return '6ft_less'
    return 'more_6ft'

# Overwrite the raw numeric columns with their bucket labels; both columns are
# one-hot encoded together with the other categorical columns further down.
# Not re-runnable on the same frame: a second pass would feed the labels back
# into the bucketing functions.
profiles.loc[:, 'age'] = list(map(age, profiles.loc[:, 'age']))
profiles.loc[:, 'height'] = list(map(height, profiles.loc[:, 'height']))


In [0]:
# Columns to one-hot encode. 'speaks' is intentionally not part of this list.
categorical_columns = [
    'age',
    'height',
    'body_type',
    'diet',
    'drinks',
    'drugs',
    'education',
    'ethnicity',
    'job',
    'offspring',
    'orientation',
    'pets',
    'religion',
    'sign',
    'sex',
    'smokes',
    'status',
]

# Ask for uint8 indicators explicitly. Newer pandas releases create boolean
# dummy columns by default, and the dtype filter applied to the training frame
# afterwards only keeps uint8 / int64 / float64 columns.
profiles2 = pd.get_dummies(profiles,
                           columns=categorical_columns,
                           dtype=np.uint8)

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
profiles_train, profiles_test = train_test_split(profiles2, test_size=0.2, random_state=5)

In [0]:
# 'income' is excluded from the feature matrices of both splits.
profiles_train = profiles_train.drop(columns=['income'])
profiles_test = profiles_test.drop(columns=['income'])

# Keep only numeric feature columns. The list is derived from the training
# frame and reused for the test frame so both end up with the same columns.
# bool is accepted next to uint8 / int64 / float64 because newer pandas
# versions produce boolean dummy columns, which the previous three-dtype check
# silently discarded.
sub_columns = [c for c in profiles_train.columns.values if
               (profiles_train[c].dtype == np.uint8) or
               (profiles_train[c].dtype == np.int64) or
               (profiles_train[c].dtype == np.float64) or
               (profiles_train[c].dtype == np.bool_)]

# Rows with any missing value are dropped, independently in each split.
profiles_num = profiles_train[sub_columns].dropna()
profiles_num_test = profiles_test[sub_columns].dropna()


# PCA

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Defined here because this is the first cell that calls it; the notebook's
# LDA section contains the same helper again further down.
def display_topics(model, feature_names, no_top_words):
    """Print the `no_top_words` highest-weighted feature names of each row of `model.components_`."""
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print("\n".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print()


# Fit the scaler on the training rows only and keep it, so the held-out rows
# can be standardised with the same statistics.
scaler = StandardScaler().fit(profiles_num)
profiles_scale = scaler.transform(profiles_num)
profiles_scale_test = scaler.transform(profiles_num_test)

pca = PCA(n_components=5).fit(profiles_scale)
transformed = pca.transform(profiles_scale)

plt.figure()
plt.plot(pca.explained_variance_)
plt.xlabel('Component Number')
plt.ylabel('Explained Variance')
plt.title('PCA Variance Explained')

print(pca.components_[0])
display_topics(pca, profiles_num.columns, 10)
# Score data in the space the model was fitted in. The unscaled frames were
# scored before, although the PCA only ever saw standardised values.
print(pca.score(profiles_scale))
print(pca.score(profiles_scale_test))

# Assign each person to the component with the largest projection.
component_m = np.argmax(transformed, axis=1)

# minlength keeps one bar per component even when the trailing components are
# never the largest one; otherwise plt.bar receives arrays of unequal length.
component_count = np.bincount(component_m, minlength=pca.n_components_)
plt.figure()
plt.bar(range(pca.n_components_), component_count)
plt.xlabel("Component #")
plt.ylabel("Number of people in component")

plt.show()

# Factor Analysis (FA)

In [0]:
# Factor analysis
from sklearn.decomposition import FactorAnalysis

# Number of latent factors to extract.
k = 5

# random_state=0 fixes the estimator's seed.
transformer = FactorAnalysis(random_state=0, n_components=k)

# Factor scores (num observations x k), kept under both names.
Z_factor = profiles_transformed = transformer.fit_transform(profiles_num)

# Loadings (k x num features).
lambda_loading = transformer.components_

# Shapes first, then the model score on the training and the held-out rows.
print(lambda_loading.shape, Z_factor.shape, sep="\n")
print(transformer.score(profiles_num), transformer.score(profiles_num_test), sep="\n")

In [0]:
# Per-row scores of the held-out data, as a plain Python list.
scores = transformer.score_samples(profiles_num_test).tolist()
# Ascending copy for inspection. Despite the name, nothing is filtered out.
scores_no_outlier = list(scores)
scores_no_outlier.sort()
print(scores_no_outlier)

In [0]:
# Ten highest-weighted features per factor. display_topics must already be
# defined when this runs; in this notebook its `def` sits in the LDA section.
display_topics(model=transformer, feature_names=profiles_num.columns, no_top_words=10)

In [0]:
# A bare `.head()` is only rendered when it is the last expression of a cell,
# so the two previews were silently discarded. Display them explicitly.
display(profiles_num.head())
display(profiles_num_test.head())

# profiles_num_test1 = profiles_num_test.drop([31494])

# Scores of the first 50 held-out rows.
print(transformer.score_samples(profiles_num_test)[:50])

# Row label singled out for inspection. It is only present if that row landed
# in the test split and survived dropna(), hence the guard instead of a KeyError.
if 31494 in profiles_num_test.index:
    print(profiles_num_test.loc[31494, :])
else:
    print("Row 31494 is not in profiles_num_test")


In [0]:
noise = transformer.noise_variance_

# Squared loadings summed over the features: one total per factor.
m1 = lambda_loading**2
m2 = np.sum(m1, axis=1)

# Share (in %) of the summed squared loadings carried by each factor.
pvar = (100 * m2 / np.sum(m2)).tolist()

print(pvar)
plt.figure()
plt.plot(pvar)
plt.xlabel('Factor index')
plt.ylabel('% of summed squared loadings')

# Cumulative share with the largest factors first. The sorted list used to be
# computed and then ignored, so the curve followed the arbitrary factor order.
pvar_sort = sorted(pvar, reverse=True)
plt.figure()
plt.plot(np.cumsum(pvar_sort))
plt.xlabel('Number of factors (largest first)')
plt.ylabel('Cumulative % of summed squared loadings')

# test_transformed = transformer.transform(profiles_num_test)

# train_err = np.linalg.norm(profiles_num - np.matmul(Z_factor, lambda_loading))
# test_err = np.linalg.norm(profiles_num_test - np.matmul(test_transformed, lambda_loading))


# print(train_err)
# print(test_err)


# c2 = np.sum(lambda_loading ** 2, axis=1)
# total_variance_ = np.sum(c2)
# pvars_ = 100 * c2 / total_variance_
# pvars_noise_ = 100 * c2 / (total_variance_ +
#                                         np.sum(noise))

# print(pvars_noise_)
# print(np.sum(pvars_))

# # plt.plot(pvars_noise_)
# # plt.xlabel('component index')
# # plt.ylabel('% explained variance');


# pvars_sort = np.sort(pvars_)
# plt.plot(np.cumsum(pvars_sort))



In [0]:
# Assign every person to the factor with the largest score.
component_m = np.argmax(Z_factor, axis=1)

# One bar per factor. minlength pads the counts when the last factors are never
# the largest one; np.bincount alone would return a shorter array and plt.bar
# would fail on the length mismatch with the x positions.
n_factors = Z_factor.shape[1]
component_count = np.bincount(component_m, minlength=n_factors)
plt.bar(range(1, n_factors + 1), component_count)
plt.xlabel("Factor #")
plt.ylabel("Number of people in factor")

plt.show()

# LDA


In [0]:
def display_topics(model, feature_names, no_top_words):
    """Print the strongest features of every component of a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing a `components_` array that is iterated row by row.
    feature_names : sequence
        Names looked up by integer position, one per column of `components_`.
    no_top_words : int
        How many of the highest-weighted names to print per component.

    For each row a "Topic <n>:" header is printed, followed by the selected
    names (largest weight first, one per line) and an empty line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Feature positions ordered from the largest to the smallest weight.
        ranked = weights.argsort()[::-1]
        top_names = [feature_names[pos] for pos in ranked[:no_top_words]]
        print("Topic %d:" % (topic_idx))
        print("\n".join(top_names))
        print()



In [0]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics.
k = 5
# Seed the estimator (as the FactorAnalysis cell does) so the fitted topics,
# and every count or plot derived from them, are the same on each run.
lda = LatentDirichletAllocation(n_components=k, random_state=0)
# Per-person topic weights, one row per training observation.
transformed_data = lda.fit_transform(profiles_num)

In [0]:
# Shapes of the fitted topic matrix and of the per-person topic weights.
print("Component shape m:", lda.components_.shape)
print("User profiles m : ", transformed_data.shape)

# Ten highest-weighted features of each topic.
no_top_words = 10
display_topics(model=lda,
               feature_names=profiles_num.columns,
               no_top_words=no_top_words)

In [0]:
# Model score divided by the number of rows, for the training and held-out sets.
print(lda.score(profiles_num)/profiles_num.shape[0])
print(lda.score(profiles_num_test)/profiles_num_test.shape[0])

# Assign every person to the topic with the largest weight.
component_m = np.argmax(transformed_data, axis=1)

# minlength pads the counts when the last topics are never the dominant one;
# without it the counts could be shorter than the x positions and plt.bar
# would raise on the mismatch.
n_topics = transformed_data.shape[1]
component_count = np.bincount(component_m, minlength=n_topics)
print(component_count)
plt.bar(range(1, n_topics + 1), component_count)
plt.xlabel("Factor #")
plt.ylabel("Number of people in factor")

plt.show()

# for i in range(k):
#     plt.figure()
#     plt.hist(transformed_data[:, i], alpha=0.3, label="Latent User " + str(i+1),
#             range=(0,1), bins=20)
#     plt.xlabel("User Proportion from Latent User i", fontsize=20)
#     plt.ylabel("Count", fontsize=20)
#     plt.tick_params(labelsize=15)
#     plt.legend()
#     plt.show()

# Old LDA

In [0]:
# 3D scatter of the first three principal components, one colour per group.
# NOTE(review): this cell cannot run as written in the visible notebook:
#   - `principalDf` is only created in the commented-out lines below,
#   - `gmodels` is not assigned anywhere in the visible source,
#   - `profiles_lda` is built in a cell that appears later in the file.
from mpl_toolkits import mplot3d

# pca3 = PCA(n_components=3)

# principalComponents = pca3.fit_transform(profiles_scale)
# principalDf = pd.DataFrame(data = principalComponents
#              , columns = ['principal component 1', 'principal component 2', 'principal component 3'])


# Square figure with a single 3D axes.
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(111, projection='3d')

ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_zlabel('Principal Component 3', fontsize = 15)
ax.set_title('3 component PCA', fontsize = 20)


# Azimuth of the 3D view.
ax.azim = 140

# Group labels to plot and the colour used for each (paired by position).
targets = [1,2,3,4,5]
colors = ['b', 'r','g','y','m']

# print(principalDf)
# Boolean mask of the rows whose `sex_m` indicator equals 1, used to subset principalDf.
temp = np.array(profiles_lda['sex_m']==1)
principalDf_m = principalDf.loc[temp,:]


for target, color in zip(targets,colors):
    # Rows whose `gmodels` value equals the current target label.
    # presumably gmodels holds one group label per row of principalDf_m -- TODO confirm
    indicesToKeep = gmodels == target
    ax.scatter(principalDf_m.loc[indicesToKeep, 'principal component 1']
               ,principalDf_m.loc[indicesToKeep, 'principal component 2']
               ,principalDf_m.loc[indicesToKeep, 'principal component 3']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

In [0]:
from sklearn.decomposition import LatentDirichletAllocation

# 20-topic model on the male subset. profiles_lda_m is built by the cell that
# splits profiles_lda by sex, which appears further down and must run first.
lda_m20 = LatentDirichletAllocation(n_components = 20)
lda_m20.fit(profiles_lda_m.values)

# Report and transform with the model fitted above. `lda_m` was referenced
# here before, a name this cell never creates, so the 20-topic fit was not
# the model being inspected.
print("Component shape m:", lda_m20.components_.shape)

transformed_m20 = lda_m20.transform(profiles_lda_m.values)

print("User profiles m : ", transformed_m20.shape)

# Ten highest-weighted features of each topic.
no_top_words = 10
display_topics(lda_m20, profiles_lda_m.columns, no_top_words)

In [0]:
# Five-topic LDA on the female subset; fit() hands back the estimator itself.
lda_f = LatentDirichletAllocation(n_components=5).fit(profiles_lda_f.values)

print("Component shape f:", lda_f.components_.shape)

# Per-person topic weights for the same rows the model was fitted on.
transformed_f = lda_f.transform(profiles_lda_f.values)

print("User profiles f : ", transformed_f.shape)

# Ten highest-weighted features of each topic.
no_top_words = 10
display_topics(model=lda_f,
               feature_names=profiles_lda_f.columns,
               no_top_words=no_top_words)

In [0]:
# Overlaid histograms of the first five columns of transformed_m, labelled 1-5.
# NOTE(review): transformed_m is not assigned anywhere in the visible notebook.
plt.figure()
for i in range(5):
    plt.hist(
        transformed_m[:, i],
        bins=20,
        range=(0, 1),
        alpha=0.3,
        label=str(i + 1),
    )
plt.xlabel("User Weight in Component i", fontsize=20)
plt.ylabel("Count", fontsize=20)
plt.tick_params(labelsize=15)
plt.legend()
plt.show()

In [0]:
# One-based position of the largest entry in each row of transformed_m.
# NOTE(review): transformed_m is not assigned anywhere in the visible notebook.
component_m = [np.argmax(person) + 1 for person in transformed_m]
print(component_m)

# Summary statistics of the assignments, shown as the cell's output.
s = pd.Series(component_m)
s.describe()


In [0]:
# Weights of the first topic (row 0 of lda_m.components_).
# NOTE(review): lda_m and profiles_onehot_m are not assigned anywhere in the
# visible notebook.
c1 = lda_m.components_[0]
# Positions whose weight lies above the 80th percentile of that row.
c1_indices = np.flatnonzero(c1 > np.percentile(c1, 80))

# Pair each selected column name with its weight.
top_c1_df = pd.DataFrame({
    'Component': profiles_onehot_m.columns.values[c1_indices],
    'Weight': c1[c1_indices],
})

# Largest weights first; sorted in place.
top_c1_df.sort_values(by='Weight', ascending=False, inplace=True)

top_c1_df.head(10)

In [0]:
# Zero-based column of transformed_m to inspect. The axis label below is
# derived from it; it used to read "Component 1" while column 4 was plotted.
# NOTE(review): transformed_m is not assigned anywhere in the visible notebook.
component_index = 4
t1 = transformed_m[:, component_index]
print(t1.shape)


# The heading used to be printed with nothing after it; show the quartiles and
# the extremes it announces.
print("Percentiles of user weights in this space:")
print(np.percentile(t1, [0, 25, 50, 75, 100]))


plt.figure()
plt.hist(t1)
plt.xlabel("User Load in Component %d" % (component_index + 1), fontsize=20)
plt.ylabel("Frequency", fontsize=20)
plt.tick_params(labelsize=15)
plt.show()

In [0]:

# Columns to one-hot encode for the per-sex LDA frames. This rebinds the
# `categorical_columns` list defined earlier in the notebook.
categorical_columns = [
    'body_type',
    'diet',
    'drinks',
    'drugs',
    'education',
    'job',
    'offspring',
    'orientation',
    'pets',
    'religion',
    'sign',
    'sex',
    'smokes',
    'status',
]

# Ask for uint8 indicators explicitly. Newer pandas releases create boolean
# dummy columns by default; the dtype filter below would then drop every
# indicator and the `sex_m` / `sex_f` lookups would raise a KeyError.
profiles_lda = pd.get_dummies(profiles,
                              columns=categorical_columns,
                              dtype=np.uint8)

profiles_lda = profiles_lda.drop(columns=['income','speaks','ethnicity'])

# Keep only uint8 / int64 / float64 columns; anything else is left out.
sub_columns = [c for c in profiles_lda.columns.values if
               (profiles_lda[c].dtype == np.uint8) or
               (profiles_lda[c].dtype == np.int64) or
               (profiles_lda[c].dtype == np.float64)]

profiles_lda = profiles_lda[sub_columns]

# Discard rows with any missing value.
profiles_lda = profiles_lda.dropna()

# split male/female; the sex indicators are removed from each subset afterwards
profiles_lda_m = profiles_lda.loc[profiles_lda["sex_m"] == 1,:]
profiles_lda_m = profiles_lda_m.drop(columns=['sex_m','sex_f'])
profiles_lda_f = profiles_lda.loc[profiles_lda["sex_f"] == 1,:]
profiles_lda_f = profiles_lda_f.drop(columns=['sex_m','sex_f'])

        
        
