<a href="https://colab.research.google.com/github/lifepopkay/Tech-Monies/blob/Modelling/First_basic_model_John.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import pickle

In [2]:
url = 'https://raw.githubusercontent.com/lifepopkay/Tech-Monies/Modelling/Cleaned%20Data.csv'
df = pd.read_csv(url)

In [3]:
df.head(2)

Unnamed: 0,company,location,title scraped for,Country,State,age,Position,upper_salary,lower_salary,payment_frequency,year_of_exp,contract_type,eligibility,title_cleaned,skills_list
0,Offerzen,Lagos,Data Scientist,Nigeria,Lagos,,Senior Level,,,,5.0,Full-time,,Senior Software Engineer,"['asp', 'AWS', 'R', 'D', 'C']"
1,GVA Partners,Lagos,Data Scientist,Nigeria,Lagos,,,,,,4.0,,undergraduate,Data Scientist,"['Flask', 'MySQL', 'Keras', 'R', 'server', 'Ex..."


In [4]:
ML=['Machine Learning', 'Machine Learning ']
for title in ML:
    df['title scraped for'].replace(title,'Machine Learning Engineer', inplace = True)

In [5]:
# Dropping records with missing values for lower salary range
df.dropna(subset=["lower_salary"], inplace = True)

In [6]:
# Columns to drop
columns_to_drop = ["company", "location", "State", "age"]

In [7]:
# Drop unnecessary columns
df.drop(columns_to_drop, axis="columns", inplace = True)

In [8]:
# Selecting Job posting with years of experience less than 20 which is the 95th percentile of the Years of experience
df = df.query("year_of_exp <= 20 ")

In [9]:
# Defining a function the determines the Position based on the Years of Experience
def position_based_on_yoe(df_YOE):
    if df_YOE < 4:
        return "Entry Level"
    elif df_YOE > 4 and df_YOE < 7:
        return "Mid-Level"
    else:
        return "Senior Level"
        

In [10]:
# Filling the missing values for Position with "Position based on Years of experience"
df["Position_based_on_YOE"] = df.year_of_exp.apply(position_based_on_yoe)
df.reset_index(drop = True, inplace = True)

for i in range(df.shape[0]):
    if df.iloc[i,2] is np.nan:
        df.iloc[i,2] = df.iloc[i,-1]
        
df.drop('Position_based_on_YOE', axis= "columns", inplace = True)

In [11]:
# Replace "hs" with "high school diploma" in eligibility
df.eligibility.replace("hs",'high school diploma', inplace = True)

# Fill missing values for eligibility with "high school diploma"
df.eligibility.fillna('high school diploma', inplace = True)

In [12]:
# Fill missing values for contract_type with the mode of contract_type
df.contract_type.fillna(df.contract_type.mode()[0], inplace = True)

In [13]:
# Creating an exchange rate for each country currency with respect to UD dollars
Exchange_rate = {"Nigeria": 427.88,
"India": 79.97,
"USA": 1,
"UK": 1.17}
df["Exchange_rate"] = df.Country.map(Exchange_rate)

# Converting the salary currency for each Job to US dollars
df.upper_salary = df.upper_salary / df.Exchange_rate
df.lower_salary = df.lower_salary / df.Exchange_rate

df.drop('Exchange_rate', axis= "columns", inplace = True)

On exploring the salary for missing Payment frequency, it was discovered that the salary far exceeds the salary for hourly payment frequency and is far below the salary for minthly payment frequency. 

So the the missing values for payment frequency will be filled with "Weekly"

In [14]:
# Filling the missing values for payment frequency with "Weekly"
df.payment_frequency.fillna("Weekly", inplace = True)

In [15]:
df.shape

(1837, 11)

In [16]:
Frequency_to_yearly = {"Hourly": 40 * 52,
"Weekly": 52,
"Monthly": 12,
"Yearly": 1}
df["Frequency_to_yearly"] = df.payment_frequency.map(Frequency_to_yearly)

# Standardizing the payment frequency to Yearly
df.upper_salary = df.upper_salary * df["Frequency_to_yearly"]
df.lower_salary = df.lower_salary * df["Frequency_to_yearly"]
df.drop('Frequency_to_yearly', axis= "columns", inplace = True)

In [17]:
df.drop(["skills_list",'title_cleaned','payment_frequency'], axis="columns", inplace = True)

In [18]:
categorical_columns = ["title scraped for",'Country','Position','contract_type','eligibility']
df[categorical_columns] = df[categorical_columns].astype('category')
df6 = df.copy()
for x in categorical_columns:
        df[x] = df[x].cat.codes
   

### Categorical Column Coding map

In [26]:
# Create dict for categorical columns
feature_dict = {}
for col in df.drop(["lower_salary", "upper_salary"], axis="columns").columns:
  if col in categorical_columns:
    feature_dict[col] = dict(enumerate(df6[col].cat.categories))
  else:
    feature_dict[col] = None
# Title_scrapped_for  = dict(enumerate(df6['title scraped for'].cat.categories))
# COUNTRY  = dict(enumerate(df6['Country'].cat.categories))
# POSITION = dict(enumerate(df6['Position'].cat.categories))
# #PAYMENT_FREQUENCY = dict(enumerate(df6['payment_frequency'].cat.categories))
# CONTRACT_TYPE = dict(enumerate(df6['contract_type'].cat.categories))
# ELIGIBILTY = dict(enumerate(df6['eligibility'].cat.categories))
pickle.dump(feature_dict, open('features.pkl', 'wb'))

In [27]:
feature_dict

{'title scraped for': {0: 'Business Analyst',
  1: 'Data Analyst',
  2: 'Data Scientist',
  3: 'Machine Learning Engineer',
  4: 'Web Developer'},
 'Country': {0: 'India', 1: 'Nigeria', 2: 'UK', 3: 'USA'},
 'Position': {0: 'Entry Level', 1: 'Mid-Level', 2: 'Senior Level'},
 'year_of_exp': None,
 'contract_type': {0: 'Contract', 1: 'Full-time'},
 'eligibility': {0: 'doctorate',
  1: 'high school diploma',
  2: 'postgraduate',
  3: 'undergraduate'}}

In [None]:
COUNTRY

{0: 'India', 1: 'Nigeria', 2: 'UK', 3: 'USA'}

In [None]:
ELIGIBILTY

{0: 'doctorate',
 1: 'high school diploma',
 2: 'postgraduate',
 3: 'undergraduate'}

In [None]:
#PAYMENT_FREQUENCY

In [None]:
CONTRACT_TYPE

{0: 'Contract', 1: 'Full-time'}

In [None]:
# Fill the missing values for upper salary using an iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute       import IterativeImputer
#from catboost             import CatBoostRegressor

imputer = IterativeImputer(
                            verbose   = 2,
                            max_iter  = 20,
                            imputation_order = 'descending' )


df2 = pd.DataFrame(imputer.fit_transform(df), columns = df.columns) # preserve the column nam

[IterativeImputer] Completing matrix with shape (1837, 8)
[IterativeImputer] Ending imputation round 1/20, elapsed time 0.04
[IterativeImputer] Change: 501873.82142142946, scaled tolerance: 1600.0 
[IterativeImputer] Ending imputation round 2/20, elapsed time 0.08
[IterativeImputer] Change: 0.0, scaled tolerance: 1600.0 
[IterativeImputer] Early stopping criterion reached.


### Features

In [None]:
features_dict = {}
i = 0
for feature in list(df2.drop(["lower_salary", "upper_salary"], axis="columns").columns):
    features_dict[i] = feature
    i +=1
    
print(features_dict)

{0: 'title scraped for', 1: 'Country', 2: 'Position', 3: 'year_of_exp', 4: 'contract_type', 5: 'eligibility'}


In [None]:
# Extracting the features and target variable
y = df2[["lower_salary", "upper_salary"]].values
X = df2.drop(["lower_salary", "upper_salary"], axis="columns")

In [None]:
# Normalizing the data
X= StandardScaler().fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (1469, 6) (1469, 2)
Test set: (368, 6) (368, 2)


In [None]:
Lr = LinearRegression()
Lr.fit(X_train,y_train)

LinearRegression()

In [None]:
yhat = Lr.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
r2_score(y_test, yhat)

0.5325986125948965

In [None]:
filename = 'firstbasicmodel.pkl'
pickle.dump(Lr, open(filename, 'wb'))