In [143]:
# import necessary libraries
import os
import zipfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import statsmodels.api as sm

# load the data
data_paths = ['data_2017.csv']  # file path for the dataset

# check if dataset exists
if not os.path.exists(data_paths[0]):  # check if dataset exists
    print("file not found")  # if file is not found
else:
    data_df = pd.read_csv(data_paths[0])  # load the dataset
    print(data_df.shape)  # print the shape of the dataset

# check for missing values
print("missing values count:", data_df.isnull().sum())  # check for missing values

# rename columns for clarity (example column renaming)
data_df.rename(columns = {
    'Age': 'age', 
    'Gender': 'gender', 
    'Country': 'country', 
    'State': 'state', 
    'self_employed': 'self_employed', 
    'family_history': 'family_history', 
    'remote_work': 'remote_work', 
    'tech_company': 'tech_company', 
    'benefits': 'benefits'
}, inplace=True)  # rename columns as needed

# drop columns with too many missing values (80% threshold)
data_df.dropna(axis=1, thresh=0.8*len(data_df), inplace=True)  # drop columns with more than 20% missing values

# check data types and non-null counts
print(data_df.info())  # display data types and non-null counts

# handle missing values in specific columns (example: 'tech_company' column)
data_df['tech_company'].fillna('other', inplace=True)  # fill missing tech company values with 'other'

# examine the distribution of categorical variables
for col in data_df.select_dtypes(include=['object']).columns:  # for each categorical column
    print(f'distribution for {col}:')  # display column name
    print(data_df[col].value_counts())  # print value counts for each column

# handle outliers by checking for extreme values (for numerical columns)
for col in ['age', 'salary']:  # for columns like age or salary
    mean = data_df[col].mean()  # calculate mean
    std = data_df[col].std()  # calculate standard deviation
    data_df[col] = data_df[col].apply(lambda x: mean if abs(x - mean) > 3*std else x)  # replace outliers

# correlation matrix for numerical features
numeric_df = data_df.select_dtypes(include=[np.number])  # select numerical columns
correlation_matrix = numeric_df.corr()  # compute correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')  # plot heatmap
plt.show()  # display the plot

# apply feature selection using SelectKBest
X = data_df.drop('target_variable', axis=1)  # feature set excluding target
y = data_df['target_variable']  # target variable
selector = SelectKBest(chi2, k=5)  # select top 5 features
X_new = selector.fit_transform(X, y)  # apply feature selection

# regression analysis example (logistic regression)
X = sm.add_constant(X)  # add constant term to X for intercept
model = sm.Logit(y, X)  # logistic regression model
results = model.fit()  # fit the model
print(results.summary())  # display regression results

# visualize the distribution of a chosen feature (example: 'age')
plt.hist(data_df['age'], bins=30, color='skyblue')  # histogram of a feature
plt.title('distribution of age')  # title of the plot
plt.xlabel('age')  # x-axis label
plt.ylabel('frequency')  # y-axis label
plt.show()  # display the plot

# focus on tech-related employees
# filter only tech company related rows
data_df = data_df[data_df['tech_company'] == 1]  # keep only tech-related employees

# handle missing values for categorical features like 'benefits' column
data_df['benefits'] = data_df['benefits'].map({'eligible': 'yes', 'not eligible': 'no'})  # map eligibility for benefits

# visualize benefits distribution
data_df['benefits'].value_counts().plot(kind='pie', autopct='%1.1f%%', colors=['skyblue', 'lightgreen'])  # pie chart for benefits
plt.title('benefits eligibility')  # title of the plot
plt.show()  # display the plot

# apply feature selection using chi-square test (example)
X = data_df.drop('target_variable', axis=1)  # features
y = data_df['target_variable']  # target variable
chi2_selector = SelectKBest(chi2, k='all')  # select all features
chi2_selector.fit(X, y)  # apply chi-square test
print("feature scores:")  # print feature scores
for i in range(len(chi2_selector.scores_)):  # print score for each feature
    print(f'{X.columns[i]}: {chi2_selector.scores_[i]}')  # print feature name and score

# correlation analysis (convert categorical columns to numeric)
label_encoder = LabelEncoder()  # initialize label encoder
for col in data_df.select_dtypes(include=['object']).columns:  # for categorical columns
    data_df[col] = label_encoder.fit_transform(data_df[col])  # transform to numeric values

# plot correlation matrix again after converting categories to numeric
numeric_df = data_df.select_dtypes(include=[np.number])  # select numerical columns
correlation_matrix = numeric_df.corr()  # compute correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', annot_kws={"size": 6})  # plot heatmap
plt.title('correlation matrix')  # title of the plot
plt.show()  # display the plot

# regression analysis (logistic regression example)
X = sm.add_constant(X)  # add constant for intercept
model = sm.Logit(y, X)  # logistic regression model
results = model.fit()  # fit model
print('results of regression analysis:')  # print regression results
print(results.summary())  # display results summary


(756, 123)
missing values count: #                                                                 0
<strong>Are you self-employed?</strong>                           0
How many employees does your company or organization have?      113
Is your employer primarily a tech company/organization?         113
Is your primary role within your company related to tech/IT?    113
                                                               ... 
What country do you <strong>work</strong> in?                     2
What US state or territory do you <strong>work</strong> in?     253
Start Date (UTC)                                                  0
Submit Date (UTC)                                                 0
Network ID                                                        0
Length: 123, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 55 columns):
 #   Column                                                                             

KeyError: 'tech_company'