In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# display all columns
pd.set_option('display.max_columns', None)
# display all rows
pd.set_option('display.max_rows', None)

In [None]:
datacomb_new = pd.read_csv('data_allthreeyears_combined_new1.csv')

In [None]:
# Rename the gender column to the shorter label 'Gender'.
# DataFrame.replace() substitutes cell *values*, so the previous call left the
# column label unchanged; rename(columns=...) is the call that changes labels.
# rename() ignores labels that are not present, so this is safe to re-run.
datacomb_new = datacomb_new.rename(columns={'Gender - Selected Choice': 'Gender'})

In [None]:
datacomb_new['Location'].unique()

In [None]:
datacomb_new['Location'].nunique()

In [None]:
list(datacomb_new.columns)

In [None]:
datacomb_new['year'].dtype

In [None]:
datacomb_new.shape

In [None]:
# Names of every column that pandas stored with the generic `object` dtype.
categorical_columns = list(datacomb_new.select_dtypes(include='object').columns)
categorical_columns

# drop the 'year' column

In [None]:
# Remove the 'year' column from the combined frame.
datacomb_new = datacomb_new.drop(columns=['year'])

In [None]:
datacomb_new.shape

In [None]:
unique_counts = datacomb_new.nunique(dropna=False)

In [None]:
# unique_counts

In [None]:
# Columns holding at most two distinct values (NaN counted, see unique_counts).
binary_cols = [name for name, n_distinct in unique_counts.items() if n_distinct <= 2]

In [None]:
# Columns holding more than two distinct values (NaN counted, see unique_counts).
non_binary_cols = [name for name, n_distinct in unique_counts.items() if n_distinct > 2]
non_binary_cols

In [None]:
# number of binary cols in the dataset
len(binary_cols)

In [None]:
datacomb_new.shape

In [None]:
datacomb_new.sample(2)

In [None]:
# binary_cols

In [None]:
# # sanity check only
# for i in binary_cols:
#     print(i)
#     print(datacomb_new[i].unique())
#     print('=====================')
    

In [None]:
# # sanity check only
# for i in non_binary_cols:
#     print(i)
#     print(datacomb_new[i].unique())
#     print('=====================')
    

In [None]:
non_binary_cols

In [None]:
# Report, for each multi-valued column, whether it contains any missing entry.
# A column without NaN produces a complete set of dummies under pd.get_dummies,
# which is where multicollinearity can appear after one-hot encoding.
for column_name in non_binary_cols:
    has_missing = datacomb_new[column_name].isna().any().any()
    print(column_name, "---------", has_missing)

In [None]:
datacomb_new[non_binary_cols].count()

# Label Binary Columns to 0 and 1

In [None]:
# Collapse every binary column to a 0/1 flag:
#   1 -> the cell is present and not equal to 0
#   0 -> the cell is missing or equal to 0
datacomb_new[binary_cols] = np.where(
    datacomb_new[binary_cols].notna() & datacomb_new[binary_cols].ne(0),
    1,
    0,
)

In [None]:
datacomb_new.shape

In [None]:
datacomb_new.head()

In [None]:
datacomb_new['Job_title - Selected Choice'].isna().sum() # number of rows with the Job_title blank

In [None]:
# drop rows with empty job title
datacomb_new = datacomb_new.dropna(subset = ['Job_title - Selected Choice'])

In [None]:
datacomb_new['Job_title - Selected Choice'].isna().sum() # number of rows with the Job_title blank is now 0

In [None]:
datacomb_new.shape

In [None]:
# Keep only respondents whose job title is something other than 'Student'.
datacomb_new = datacomb_new.loc[datacomb_new['Job_title - Selected Choice'].ne('Student')]
datacomb_new.shape

In [None]:
print(len(binary_cols))
print(len(non_binary_cols))

# Dropping columns we think are not associated with the job title

In [None]:
datacomb_new.shape

In [None]:
# Mapping from raw job-title answers to the consolidated label used as the
# prediction target. The three product/project/program manager variants are
# merged into a single class; titles not listed here are left as they are.
job_title_dict = {
    'Data Analyst (Business, Marketing, Financial, Quantitative, etc)': 'Data Analyst',
    'Product Manager': 'Product/Project/Program Manager',
    'Product/Project Manager': 'Product/Project/Program Manager',
    'Program/Project Manager':'Product/Project/Program Manager',
    'Machine Learning Engineer':'Machine Learning/ MLops Engineer'}

def replace_text(cell_value, replacements):
    """Translate a single cell through a lookup table.

    Args:
        cell_value: The raw cell content.
        replacements: Dict mapping raw values to their replacement.

    Returns:
        ``cell_value`` unchanged when it is ``None`` or NaN; otherwise the
        mapped value, or the lookup key itself when no mapping exists. Float
        inputs are looked up (and returned) in their string form.
    """
    # Missing cells pass straight through.
    if cell_value is None or pd.isna(cell_value):
        return cell_value

    # Floats are converted to text before the lookup.
    lookup_key = str(cell_value) if isinstance(cell_value, float) else cell_value
    return replacements.get(lookup_key, lookup_key)

# Run every job-title cell through replace_text with job_title_dict, then list
# the distinct titles that remain so the merge can be checked by eye.
datacomb_new['Job_title - Selected Choice'] = datacomb_new['Job_title - Selected Choice'].apply(replace_text, replacements=job_title_dict)
datacomb_new['Job_title - Selected Choice'].unique()

In [None]:
# Move the job-title column to the last position: pop() removes it from the
# frame and returns it as a Series, insert() re-attaches it after all others.
# The Series is kept in `Job_title`, which later cells pass to train_test_split
# as the target.
Job_title = datacomb_new.pop('Job_title - Selected Choice')
datacomb_new.insert(len(datacomb_new.columns), 'Job_title - Selected Choice', Job_title)

In [None]:
# Columns excluded from the feature matrix, including the target itself.
cols_to_drop = [
    'Job_No.OfDSTeamMember',
    'Job_EmployerUsingML?',
    'Money Spent on ML/Cloud Computing',
    'Times used TPU',
    'Job_title - Selected Choice',
]
datacomb_new_wo_Jtitle = datacomb_new.drop(columns=cols_to_drop)


In [None]:
# Multi-valued columns that are still present after cols_to_drop was removed.
filtered_non_binary_cols = list(
    filter(lambda name: name not in cols_to_drop, non_binary_cols)
)

In [None]:
filtered_non_binary_cols

In [None]:
# One-hot encode the multi-valued feature columns. Each dummy column is named
# "<original column> - <value>" because of prefix_sep=' - '.
encoded_df = pd.get_dummies(datacomb_new_wo_Jtitle, columns = filtered_non_binary_cols, prefix_sep=' - ')

In [None]:
encoded_df.shape

In [None]:
encoded_df[encoded_df['Popular IDEs - Click to write Choice 13'] ==1].count()

In [None]:
# Drop one Age dummy so the remaining Age levels are not perfectly collinear.
# Only the Age group has a level removed here.
# NOTE(review): mutates encoded_df in place, so re-running this cell raises
# KeyError once the column is gone.
encoded_df.drop('Age - 70+', axis = 1, inplace = True) # to remove multicollinearity

In [None]:
encoded_df.shape # verifying that there is one less column

In [None]:
encoded_df.head()

In [None]:
# Alphabetical listing of the encoded feature names.
a = sorted(encoded_df.columns)
a

## Random Forest

In [None]:
from sklearn.model_selection import train_test_split

# `rng` is one RandomState object that later train_test_split calls in this
# notebook also receive as random_state.
# NOTE(review): because the generator object is shared, each split depends on
# how many times `rng` has been used before it; re-running a split cell out of
# order should give a different partition. Confirm this is intended, or pass an
# integer seed instead.
rng = np.random.RandomState(seed=42)
# 80/20 split of the one-hot encoded features against the job-title target.
X_train, X_test, y_train, y_test = train_test_split( encoded_df, Job_title , test_size=0.20, random_state= rng)

print( X_train.shape )
print( X_test.shape )

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 100 trees, each capped at 15 leaf nodes, trained on all cores.
# random_state is pinned so the baseline metrics are identical on every
# Restart & Run All; without it the bootstrap samples and feature draws vary
# from run to run.
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=15,
    n_jobs=-1,
    random_state=42,
)

rnd_clf.fit(X_train, y_train)

# Predictions on the held-out split, scored by evaluate() in a later cell.
y_pred_rf = rnd_clf.predict(X_test)

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from pprint import pprint

def evaluate(test, pred):
    """Print a classification report followed by micro-averaged precision and recall.

    Args:
        test: Ground-truth labels (or a label-indicator matrix).
        pred: Predicted labels in the same format as ``test``.

    Returns:
        None. All results are written to stdout.
    """
    report = classification_report(test, pred)
    print(report)

    # Both scores are computed before either is printed.
    micro_scores = {
        "Precision: ": precision_score(test, pred, average='micro'),
        "Recall: ": recall_score(test, pred, average='micro'),
    }
    for label, score in micro_scores.items():
        print(label, score)


In [None]:
evaluate( y_test, y_pred_rf )

In [None]:
def RandomForest_HyperParameter_Evaluator(param, X_train, y_train, X_test, y_test):
    """Train a random forest with the given settings and print its test metrics.

    Args:
        param: Dict of keyword arguments forwarded to RandomForestClassifier.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        The fitted RandomForestClassifier.
    """
    forest = RandomForestClassifier(**param).fit(X_train, y_train)
    evaluate(y_test, forest.predict(X_test))
    return forest

In [None]:
# finding best parameters (1)

# from sklearn.model_selection import GridSearchCV

# params_to_test = {
#     'n_estimators':[10,25,50],#,100,200,500
#     'max_depth':[3,5,10] #15,20,25
# }

# #here you can put any parameter you want at every run, like random_state or verbosity
# rf_model = RandomForestClassifier(random_state=rng)
# #here you specify the CV parameters, number of folds, numberof cores to use...
# grid_search = GridSearchCV(rf_model, param_grid=params_to_test, cv=10, scoring='f1_macro', n_jobs=4)

# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_ 

# #best_params is a dict you can pass directly to train a model with optimal settings 
# best_model = RandomForestClassifier(**best_params)

In [None]:
# best_params
# {'max_depth': 10, 'n_estimators': 50}

In [None]:
# best_model

In [None]:
# best_model.fit( X_train, y_train )
# best_model

In [None]:
# y_pred_rf = best_model.predict( X_test )

In [None]:

evaluate( y_test, y_pred_rf )

In [None]:
# Finding best parameters (2)
# from sklearn.model_selection import RandomizedSearchCV
# from pprint import pprint
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# pprint(random_grid)

In [None]:
# # !!! resource intensive !!!
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(X_train, y_train)


In [None]:
# rf_random.best_params_
# {'n_estimators': 1600,
#  'min_samples_split': 5,
#  'min_samples_leaf': 1,
#  'max_features': 'sqrt',
#  'max_depth': 70,
#  'bootstrap': False}

In [None]:
# rf_random.best_score_ 


In [None]:
# best_model_2 = RandomForestClassifier(**rf_random.best_params_)
# best_model_2

In [None]:
# best_model_2.fit(X_train, y_train)

In [None]:
# y_pred_best_2 = best_model_2.predict(X_test)

In [None]:
# evaluate( y_test, y_pred_best_2 )

# remove
# 'Currently not employed'

# merge

In [None]:
# from sklearn.model_selection import GridSearchCV
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }
# # Create a based model
# rf = RandomForestClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

In [None]:
def print_all_columns(df):
    """Return the column labels of ``df`` as an ascending sorted list.

    Despite the name, nothing is printed; the list is returned so a notebook
    cell can display it as its last expression.
    """
    return sorted(df.columns)

## Exploration 1
1. Remove country

In [None]:
# Exploration 1 feature frame: the un-encoded features minus every column whose
# name matches 'Location'.
exploration_df_1 = datacomb_new_wo_Jtitle.drop(
    columns=list(datacomb_new_wo_Jtitle.filter(regex='Location'))
)

print_all_columns(exploration_df_1)
exploration_df_1.shape

In [None]:
# Columns to one-hot encode for exploration 1: the same multi-valued columns as
# the full model, restricted to those still present after the Location columns
# were dropped.
# The earlier form passed `list.remove('Location')` as `columns=`; remove()
# returns None, so get_dummies received columns=None and fell back to encoding
# only object/category columns instead of this list.
filtered_non_binary_cols_dum = [
    col for col in filtered_non_binary_cols if col in exploration_df_1.columns
]
exploration_1_encoded_df = pd.get_dummies(
    exploration_df_1,
    columns=filtered_non_binary_cols_dum,
    prefix_sep=' - ',
)
exploration_1_encoded_df.shape

In [None]:
exploration_1_encoded_df.drop('Age - 70+', axis = 1, inplace = True) # to remove multi-colinearity

In [None]:
print_all_columns(exploration_1_encoded_df)

In [None]:
# Train/test split for exploration 1 (features without Location).
# This rebinds X_train / X_test / y_train / y_test, replacing the split made for
# the full feature set; every later cell that uses these names sees this split.
# `rng` is the RandomState object created for the first split, not a fresh seed.
X_train, X_test, y_train, y_test = train_test_split( exploration_1_encoded_df, Job_title , test_size=0.20, random_state= rng)

print( X_train.shape )
print( X_test.shape )

In [None]:

# exploration_1_rnd_clf = RandomForestClassifier(**rf_random.best_params_)

# exploration_1_rnd_clf.fit( X_train, y_train )

# exploration_1_y_pred_rf = exploration_1_rnd_clf.predict( X_test )

In [None]:
# evaluate(y_test, exploration_1_y_pred_rf)

## "Final"

In [None]:
hyper_parameters = {'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [None]:
# Class balance check: number of rows per job title in the current train and
# test splits, shown side by side.

# Per-title counts in the training labels.
jobs_df = pd.DataFrame(y_train)
jobs_count = pd.DataFrame(jobs_df.groupby('Job_title - Selected Choice')['Job_title - Selected Choice'].count())
jobs_count.index.name = None

# Per-title counts in the test labels.
jobs_df_test = pd.DataFrame(y_test)
jobs_count_test = pd.DataFrame(jobs_df_test.groupby('Job_title - Selected Choice')['Job_title - Selected Choice'].count())
jobs_count_test.index.name = None

# Align the two count tables on job title; a title missing from one split
# shows up as NaN in that split's column.
jobs_count_all = pd.concat([jobs_count, jobs_count_test], axis=1)
jobs_count_all.columns=['train','test']

# Rarest training classes first.
jobs_count_all.sort_values('train')

# jobs_df.groupby('Job_title - Selected Choice').count()
# y_test['Data Administrator','Developer Advocate','Data Architect','Developer Relations/Advocacy']


In [None]:
final_rf = RandomForestClassifier(**hyper_parameters)
final_rf

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
final_rf.fit(X_train, y_train)

In [None]:
final_y_pred = final_rf.predict(X_test)
final_y_pred

In [None]:
# this one the last final model
evaluate(y_test, final_y_pred)

In [None]:
final_y_pred_proba = final_rf.predict_proba(X_test)
final_y_pred_proba

In [None]:
final_y_pred_proba.shape

In [None]:
len(set(Job_title))

In [None]:
import numpy as np

def mark_largest_values(matrix, x=1):
    """Flag the ``x`` largest entries of every row in a 2-D array.

    Args:
        matrix: 2-D numpy array (for example class probabilities per sample).
        x: Number of top entries to flag per row. Defaults to 1.

    Returns:
        A tuple ``(result_matrix, indices)``:
        - ``result_matrix``: array shaped and typed like ``matrix``, holding 1
          at the flagged positions and 0 elsewhere.
        - ``indices``: list with one array per row, containing the column
          indices of that row's ``x`` largest values, largest first.
    """
    result_matrix = np.zeros_like(matrix)

    # Sorting the negated row ascending ranks the original values descending.
    indices = [
        np.argsort(-matrix[row_idx, :])[:x]
        for row_idx in range(matrix.shape[0])
    ]

    for row_idx, top_columns in enumerate(indices):
        result_matrix[row_idx, top_columns] = 1

    return result_matrix, indices

def get_x_recommendation(model, x_test, x=1):
    """Return the ``x`` most probable class labels for every row of ``x_test``.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        x_test: Feature rows to score.
        x: Number of labels to return per row. Defaults to 1.

    Returns:
        List with one inner list per row, holding that row's top ``x`` class
        labels ordered from most to least probable.
    """
    class_probabilities = model.predict_proba(x_test)

    # Only the ranked column indices are needed; the 0/1 mask is discarded.
    _mask, top_indices = mark_largest_values(class_probabilities, x)

    return [
        [model.classes_[col] for col in row_top]
        for row_top in top_indices
    ]

matrix, indices = mark_largest_values(final_y_pred_proba)

final_rf.classes_

In [None]:
# Top-3 recommended titles per test row, ordered from most to least probable.
# The ranking is recomputed here with x=3: the `indices` variable produced in
# the previous cell came from mark_largest_values() with its default x=1, so
# reusing it gave only a single title per row.
_top3_mask, top3_indices = mark_largest_values(final_y_pred_proba, x=3)
recommended_titles_3 = [
    [final_rf.classes_[i] for i in idx] for idx in top3_indices
]
# recommended_titles_3

In [None]:
y_test_2 = [[y] for y in y_test]
print(y_test_2[:3])
print(recommended_titles_3[:3])
# evaluate(y_test_2,recommended_titles_3)

In [None]:

from sklearn.preprocessing import MultiLabelBinarizer
def eval_multi(test, pred):
    """Score multi-label recommendations against single true labels.

    Each true label is wrapped in a one-element list, truth and predictions are
    binarised into label-indicator matrices, and the result is passed to
    ``evaluate``.

    Args:
        test: Iterable of true labels, one per sample.
        pred: Iterable of label collections, one per sample (for example the
            top-k recommended titles).

    Returns:
        None. The class list, both indicator matrices and the metrics are
        printed.
    """
    test_multi = [[y] for y in test]
    pred_multi = [list(labels) for labels in pred]

    # Fit on truth AND predictions. MultiLabelBinarizer.transform() ignores
    # labels it did not see during fit (it only warns), so fitting on the truth
    # alone would drop any predicted class absent from the test split and those
    # wrong predictions would never count as false positives.
    mlb = MultiLabelBinarizer()
    mlb.fit(test_multi + pred_multi)
    print(mlb.classes_)

    test_transformed = mlb.transform(test_multi)
    pred_transformed = mlb.transform(pred_multi)
    print(test_transformed)
    print(pred_transformed)

    evaluate(test_transformed, pred_transformed)

eval_multi(y_test, recommended_titles_3)

In [None]:
# compare best probability n pred
# best_prob = np.array(recommended_titles_3)[:, 0]
# evaluate(final_y_pred, best_prob)

In [None]:
# Alphabetical listing of the exploration-1 feature names.
a = sorted(exploration_1_encoded_df.columns)
a

In [None]:
# visualize (only can visualize 1 tree)
# from sklearn.tree import export_graphviz

# export_graphviz(final_rf,
#                 feature_names=exploration_1_encoded_df.columns,
#                 filled=True,
#                 rounded=True)

# os.system('dot -Tpng tree.dot -o tree.png')


## Post final exploration - 1

In [None]:
# Post-final exploration frame: the exploration-1 features minus every column
# whose name matches 'Popular IDEs'.
pfexp_df = exploration_df_1.drop(
    columns=list(exploration_df_1.filter(regex='Popular IDEs'))
)
pfexp_df.columns.tolist()

In [None]:
xcl_loc = filtered_non_binary_cols.copy()
xcl_loc.remove('Location')

In [None]:
print_all_columns(pfexp_df)

In [None]:
# ordinal encodings
# The CSV is read as pairs of columns: "<name>" holds the raw category and
# "<name>_encoded" the value it is replaced with.
ord_encodings = pd.read_csv('ordinal_encodings_2.csv')
# print(pfexp_df.Job_Salary.unique())
included_columns = ['Age', 'Education level_attainedOrGGtoAttain', 'Coding Experience (in years)', 'Years in ML', 
                    'Job_Salary']

# Nested mapping {column: {raw value: encoded value}}; passing this shape to
# DataFrame.replace applies each inner mapping to its own column only.
nominal_features_order_dict = {}
for col in included_columns:
    nominal_features_order_dict[col] = dict(zip(ord_encodings[col], ord_encodings[col+'_encoded']))

pfexp_df.replace(nominal_features_order_dict,inplace=True)

# Flat mapping: applied to every column of the frame, not just the gender one.
# NOTE(review): gender values other than 'Man'/'Woman' are left as they are.
gender_dict = {'Man': 1, 'Woman': 2}
pfexp_df.replace(gender_dict,inplace=True)

# Fill the remaining missing cells with 0. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
pfexp_df.replace({np.nan:0},inplace=True)

# Show the distinct values left in each multi-valued column after encoding.
for col in xcl_loc:
    print('')
    print(col)
    print(pfexp_df[col].unique())

In [None]:
pfexp_df

X_train, X_test, y_train, y_test = train_test_split( pfexp_df, Job_title, test_size=0.20, random_state= rng)

print( X_train.shape )
print( X_test.shape )

In [None]:
# Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 30, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(X_train, y_train)

In [None]:
# rf_random.best_params_


In [None]:
so_called_best_param = {'n_estimators': 1200,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 50,
 'bootstrap': False}
final_rf = RandomForestClassifier(**so_called_best_param)
final_rf

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
final_rf.fit(X_train, y_train)

In [None]:
final_y_pred = final_rf.predict(X_test)
final_y_pred

In [None]:
evaluate(y_test, final_y_pred)

In [None]:
recommendation_3 = get_x_recommendation(final_rf, X_test, x=3)

eval_multi(y_test, recommendation_3)

In [None]:
# from sklearn.model_selection import GridSearchCV
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True, False],
#     'max_depth': [40,45,50,55,60],
#     'max_features': ['auto', 'sqrt'],
#     'min_samples_leaf': [2, 3],
#     'min_samples_split': [2, 3],
#     'n_estimators': [10,15,20,30,50,1000, 1100, 1150, 1200, 1250]
# }
# # Create a based model
# rf = RandomForestClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# grid_rf = grid_search.fit(X_train, y_train)

In [None]:
# grid_rf.best_params_
# {'bootstrap': False,
#  'max_depth': 55,
#  'max_features': 'sqrt',
#  'min_samples_leaf': 2,
#  'min_samples_split': 3,
#  'n_estimators': 1100}

In [None]:
# from sklearn.model_selection import GridSearchCV
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True, False],
#     'max_depth': [40,45,50,55,60],
#     'max_features': ['sqrt'],
#     'min_samples_leaf': [2, 3],
#     'min_samples_split': [2, 3],
#     'n_estimators': [10,15,20,30,50]
# }
# # Create a based model
# rf = RandomForestClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

# grid_rf = grid_search.fit(X_train, y_train)
# grid_rf

In [None]:
# grid_rf.best_params_

# {'bootstrap': False,
#  'max_depth': 50,
#  'max_features': 'sqrt',
#  'min_samples_leaf': 2,
#  'min_samples_split': 3,
#  'n_estimators': 50}

In [None]:
best_grid = {'bootstrap': False,
 'max_depth': 50,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 50}

In [None]:
best_random = {'bootstrap': False,
 'max_depth': 55,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 1100}

In [None]:
best_grid_forest = RandomForest_HyperParameter_Evaluator(best_grid, X_train, y_train, X_test, y_test)
# best_grid_forest.feature_importances_

In [None]:
RandomForest_HyperParameter_Evaluator(best_random, X_train, y_train, X_test, y_test)


In [None]:
X_train.sample()

In [None]:
# Variant of best_random with shallower trees and 5 features per split.
best_random_2 = dict(best_random)
print(best_random_2)
best_random_2.update(max_depth=15, max_features=5)
print(best_random_2)
RandomForest_HyperParameter_Evaluator(best_random_2, X_train, y_train, X_test, y_test)


In [None]:
# Flag features that hold one single value (NaN counted as a value) across the
# whole training split.
for col in X_train.columns:
    if X_train[col].nunique(dropna=False) == 1:
        print("this fraud: ", col)