In [10]:
# Restrict the Kaggle credentials file to owner-only read/write access
import os
import stat

# Raw string keeps the Windows backslashes literal; expanduser only has an
# effect when the path begins with "~", so here it returns the path unchanged.
kaggle_json_path = os.path.expanduser(
    r"C:\code\Machine Learning\Project\Project1-Salary\kaggle\kaggle.json"
)

# S_IRUSR | S_IWUSR is mode 0o600: owner read + owner write, nothing else.
# NOTE(review): on Windows os.chmod is documented to affect only the
# read-only flag, so the "600" in the message below may overstate what was
# applied on this machine - confirm against the os.chmod documentation.
owner_read_write = stat.S_IRUSR | stat.S_IWUSR
os.chmod(kaggle_json_path, owner_read_write)

print(f"Permissions set to 600 for {kaggle_json_path}")


Permissions set to 600 for C:\code\Machine Learning\Project\Project1-Salary\kaggle\kaggle.json


In [11]:
# Read the salary dataset from disk and show its first rows
import pandas as pd

# Location of the extracted CSV (adjust to wherever the file lives on your machine)
salary_csv_path = r"C:\code\Machine Learning\Project\Project1-Salary\dataset_directory\Salary Data.csv"
df = pd.read_csv(salary_csv_path)

# Print the top five rows as a quick sanity check
print(df.head(5))


    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  


In [12]:
# Normalize 1 - Remove every row that contains at least one missing value
df = df.dropna(axis="index", how="any")


In [13]:
# Normalize 2 - Remove fully duplicated rows, keeping the first occurrence
df = df.drop_duplicates(keep="first")


In [14]:
# Preview the first five rows of the current frame
print(df.head(5))

    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  


In [15]:
# Normalize 3 - Encode the ordinal 'Education Level' strings as ordered integers

import pandas as pd
from sklearn.preprocessing import LabelEncoder  # not used in this cell; a later cell relies on this import

# List the distinct education levels present in the data
categories = df['Education Level'].unique()
print("Education Levels:", categories)

# Rank the levels so that a higher degree maps to a larger integer
education_mapping = {"Bachelor's": 1, "Master's": 2, 'PhD': 3}

# Series.map turns every value that is missing from the mapping into NaN
# without any warning. That also happens when this cell is re-run on the
# already-encoded column, so fail loudly instead of silently corrupting it.
unmapped = [level for level in categories if level not in education_mapping]
if unmapped:
    raise ValueError(
        "Unmapped 'Education Level' values (extend education_mapping, or reload "
        f"the data if this cell was already run): {unmapped}"
    )

# Replace each education string with its ordinal rank
df['Education Level'] = df['Education Level'].map(education_mapping)


print(df.head())


Education Levels: ["Bachelor's" "Master's" 'PhD']
    Age  Gender  Education Level          Job Title  Years of Experience  \
0  32.0    Male                1  Software Engineer                  5.0   
1  28.0  Female                2       Data Analyst                  3.0   
2  45.0    Male                3     Senior Manager                 15.0   
3  36.0  Female                1    Sales Associate                  7.0   
4  52.0    Male                2           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  


In [16]:
# Normalize 4 - Encode the non-ordinal (nominal) string columns
import pickle

import pandas as pd

# One-hot encode 'Gender'; drop_first=True drops the first category's column
# to avoid multicollinearity
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)

# Integer-encode 'Job Title' with a LabelEncoder
# (LabelEncoder itself is imported in an earlier cell)
label_encoder = LabelEncoder()
job_title_codes = label_encoder.fit_transform(df['Job Title'])
df['Job Title'] = job_title_codes

# Persist the fitted encoder to label_encoder.pkl
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print(df.head())


    Age  Education Level  Job Title  Years of Experience    Salary  \
0  32.0                1        159                  5.0   90000.0   
1  28.0                2         17                  3.0   65000.0   
2  45.0                3        130                 15.0  150000.0   
3  36.0                1        101                  7.0   60000.0   
4  52.0                2         22                 20.0  200000.0   

   Gender_Male  
0         True  
1        False  
2         True  
3        False  
4         True  


In [17]:
# Spot-check a few columns after the normalization steps
columns_to_check = ['Age', 'Gender_Male', 'Salary']
print(df[columns_to_check].head())


    Age  Gender_Male    Salary
0  32.0         True   90000.0
1  28.0        False   65000.0
2  45.0         True  150000.0
3  36.0        False   60000.0
4  52.0         True  200000.0


In [18]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import pickle
import pandas as pd

# Split the frame into the feature matrix (everything but "Salary") and the target column
X = df.drop(columns=["Salary"])
y = df[["Salary"]]

# Fit a mean imputer on the features. fit_transform returns a new array and
# does not modify X in place, so the result has to be assigned back; it is
# re-wrapped in a DataFrame to keep the original column names and index.
imputer = SimpleImputer(strategy="mean")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Persist the fitted imputer to imputer.pkl for later use
with open('imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)

In [19]:


# Scale every feature column into the 0..1 range and keep the result as a
# DataFrame with the original column names
scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Scale the target with its own, separate MinMaxScaler
y_scaler = MinMaxScaler()
y_normalized = pd.DataFrame(y_scaler.fit_transform(y), columns=["Salary"])

# Show the first rows of the scaled features and the scaled target
print("Normalized Features (X):")
print(X_normalized.head())

print("\nNormalized Target (y):")
print(y_normalized.head())

# Persist both fitted scalers: features -> feature_scaler.pkl, target -> target_scaler.pkl
for pickle_path, fitted_scaler in (('feature_scaler.pkl', scaler), ('target_scaler.pkl', y_scaler)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_scaler, f)


Normalized Features (X):
        Age  Education Level  Job Title  Years of Experience  Gender_Male
0  0.300000              0.0   0.919075                 0.20          1.0
1  0.166667              0.5   0.098266                 0.12          0.0
2  0.733333              1.0   0.751445                 0.60          1.0
3  0.433333              0.0   0.583815                 0.28          0.0
4  0.966667              0.5   0.127168                 0.80          1.0

Normalized Target (y):
     Salary
0  0.359103
1  0.258963
2  0.599439
3  0.238935
4  0.799720


In [20]:
# List the feature column names
feature_columns = X.columns
print(feature_columns)

Index(['Age', 'Education Level', 'Job Title', 'Years of Experience',
       'Gender_Male'],
      dtype='object')


In [21]:
# Fit a linear regression on the scaled features and the scaled target
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=X_normalized, y=y_normalized)


In [22]:
import pickle

# Serialize the trained model to model.pkl
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print("Saved model into model.pkl file!")


Saved model into model.pkl file!


In [None]:
# ngrok authtoken "<REDACTED>"  # SECURITY: never commit auth tokens; supply the token via an environment variable and rotate the one that was exposed here

: 