In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'darkgrid' style to the plots drawn below.
sns.set_style('darkgrid')

In [None]:
# Load the job-postings CSV into the working DataFrame.
# The path is relative to the notebook's working directory.
csv_path = "Pakistan Available Job Dec 19 - Mar-21.csv"
df = pd.read_csv(csv_path)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.columns

In [None]:
# Forward-fill missing labels with the value from the previous row.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which does not update `df` under copy-on-write. `fillna(method=...)` is
# also deprecated in favour of `.ffill()`.
# NOTE(review): forward-filling assumes a neighbouring row's label is a
# sensible stand-in for a missing one; confirm this is intended.
df['label'] = df['label'].ffill()

In [None]:
# Forward-fill missing company names with the value from the previous row.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which does not update `df` under copy-on-write. `fillna(method=...)` is
# also deprecated in favour of `.ffill()`.
# NOTE(review): forward-filling assumes a neighbouring row's company is a
# sensible stand-in for a missing one; confirm this is intended.
df['Company Name'] = df['Company Name'].ffill()

In [None]:
df.isna().sum()

In [None]:
df.head()

# Exploratory Data Analysis
Exploratory data analysis (EDA) is used to analyze and investigate data sets and summarize their main characteristics, often employing data visualization methods.

In [None]:
# Bar chart of the relative frequency of each label value
# (normalize=True turns the counts into fractions).
fig, axes = plt.subplots(figsize=(8, 4))
label_share = df['label'].value_counts(normalize=True)
label_share.plot.bar(ax=axes, width=0.3, color=('red','green'))

plt.tight_layout()
plt.show()

In [None]:
# Select the nine named columns (still using their original, spaced names)
# and preview the first rows.
selected_columns = [
    'Job Name',
    'label',
    'Company Name',
    'Job Type',
    'Experience Required',
    'Department',
    'JD',
    'City',
    'Date Posted',
]
cols = df[selected_columns]
cols.head()

In [None]:
# Replace the spaces in multi-word column names with underscores.
# Columns not listed in the mapping keep their names.
column_renames = {
    'Job Name': 'Job_Name',
    'Company Name': 'Company_Name',
    'Job Type': 'Job_Type',
    'Experience Required': 'Experience_Required',
    'Date Posted': 'Date_Posted',
}
df = df.rename(columns=column_renames)
df.head()

In [None]:
df['Job_Name'].count()

In [None]:
a = df['Job_Name'].unique().tolist()[0:10]
b = a[0].split(',')
b

In [None]:
# Count postings for the ten most frequent job names.
# The earlier version passed the comma-split pieces of a single job title as
# `y`, so `data=df` was ignored and the chart only counted those fragments.
f, ax = plt.subplots(figsize=(7, 3))
top_job_names = df['Job_Name'].value_counts().index[:10]
sns.countplot(y='Job_Name', data=df, order=top_job_names, color='c', ax=ax)

In [None]:
# Distinct company names in order of first appearance (not ranked by
# frequency), followed by a preview of the first five.
top_com = pd.unique(df['Company_Name'])
top_com[:5]

In [None]:
data_scientis_jobs = df[ df['Job_Name'].str.contains('Data Scientist')]
data_scientis_jobs

In [None]:
data_scientis_jobs['Job_Name'].count()

In [None]:
# Distinct experience levels among the Data Scientist postings.
e = data_scientis_jobs['Experience_Required'].unique()
# Count postings per experience level. Plot the column of the filtered frame:
# passing the unique values as `x` gives every bar a height of 1 and ignores
# `data` entirely.
sns.countplot(x='Experience_Required', data=data_scientis_jobs)
plt.xlabel('Experience')

In [None]:
# Distinct cities among the Data Scientist postings.
c = data_scientis_jobs['City'].unique()
# Count postings per city. Plot the column of the filtered frame: passing the
# unique values as `x` gives every bar a height of 1 and ignores `data`
# entirely.
sns.countplot(x='City', data=data_scientis_jobs)
plt.xlabel('City')

# DATA ANALYSIS OF PAKISTAN DATA

In [None]:
pak_df = df['Job_Type'].value_counts()
pak_df

In [None]:
plt.figure(figsize=(10,10))
res=sns.barplot(x=pak_df, y=pak_df.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16, color='black')
plt.xlabel('Value Counts of Job Type',fontsize = 16, color='black')
plt.ylabel('Job Type Names',fontsize = 16, color='black')
plt.title('Job Types in Pakistan',fontsize = 16, color='black')
plt.show()

In [None]:
#How many years of experienced required for job
pak_df = df['Experience_Required'].value_counts()
pak_df

In [None]:
plt.figure(figsize=(10,10))
res=sns.barplot(x=pak_df, y=pak_df.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16, color='black')
plt.xlabel('Value Counts of Experienced Required for Job', fontsize = 16, color='black')
plt.ylabel('Names of Experienced Required', fontsize = 16, color='black')
plt.title('Experienced Required for Job in Pakistan', fontsize = 20, color='black')
plt.show()

In [None]:
#How many Jobs are available in each departments.
pak_df_Dept = df['Department'].value_counts().head(15)
pak_df_Dept

In [None]:
plt.figure(figsize=(10,8))
res=sns.barplot(x=pak_df_Dept, y=pak_df_Dept.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16, color='black')
plt.xlabel('Value Counts of Departments of Job', fontsize = 16, color='black')
plt.ylabel('Names of Departments', fontsize = 16, color='black')
plt.title('Departments of Job in Pakistan', fontsize = 20, color='black')
plt.show()

In [None]:
#How many Jobs are available in each city.
pak_df_City = df['City'].value_counts().head(15)
pak_df_City

In [None]:
plt.figure(figsize=(10,8))
res=sns.barplot(x=pak_df_City, y=pak_df_City.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16, color='black')
plt.xlabel('Value Counts of City of Job', fontsize = 16, color='black')
plt.ylabel('Names of City', fontsize = 16, color='black')
plt.title('City of Job in Pakistan', fontsize = 20, color='black')
plt.show()

In [None]:
#How many Jobs are Posted in each dates.
pak_df_Date = df['Date_Posted'].value_counts().head(10)
pak_df_Date

In [None]:
plt.figure(figsize=(10,8))
res=sns.barplot(x=pak_df_Date, y=pak_df_Date.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16)
plt.xlabel('Value Counts of Dates of Job', fontsize = 20)
plt.ylabel('Dates', fontsize = 20)
plt.title('Dates of Job Posted in Pakistan', fontsize = 25)
plt.show()

# Converting string data to numeric using one-hot encoding

In [None]:
job = pd.get_dummies(df, drop_first= True)
job.head()

In [None]:
jobpk = pd.concat([df, job ], axis = 1)
jobpk.head()

In [None]:
categorical_features = ['Job_Name', 'Company_Name', 'Job_Type', 'Experience_Required', 'Department', 'JD', 'City', 'Date_Posted']

In [None]:
data = jobpk.drop(columns=categorical_features, axis=1)
data.head()

In [None]:
# Correlation of every column with `label`, strongest positive first,
# drawn as a single-column annotated heatmap.
label_corr = data.corr()['label']
corr = label_corr.sort_values(ascending=False).to_frame()
fig, ax = plt.subplots(figsize=(2, 8))
sns.heatmap(corr, cmap='Blues', cbar=False, annot=True, ax=ax)
plt.show()

In [None]:
# Separate the feature matrix from the target column.
X = data.drop(columns='label')
Y = data['label']

In [None]:
#LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. The target is the notebook's `Y` (the lowercase `y`
# used previously was never defined), and train_test_split is imported above
# because no earlier cell imports it.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Fit a logistic regression with scikit-learn's default settings.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows and print them.
# The prediction uses `X_test` (the previous `testX` was never defined), and
# the result is stored under the same lowercase name that the print reads.
logreg_predict = logreg.predict(X_test)
print("Prediction Using Logistic Regression on test set: {}".format(logreg_predict))

In [None]:
# Predictions for the held-out rows, then mean accuracy on the same rows.
y_pred = logreg.predict(X_test)
logreg_test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_test_accuracy))

In [None]:
y_pred = kc.predict(X_test)
print('Accuracy of KNeighbors classifier on train set: {:.2f}'.format(kc.score(X_train, y_train)))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
kc = KNeighborsClassifier()
kc.fit(x_train,y_train)

In [None]:
y_pred = kc.predict(X_test)
print('Accuracy of KNeighbors classifier on test set: {:.2f}'.format(kc.score(X_test, y_test)))

In [None]:
y_pred = kc.predict(X_test)
print('Accuracy of KNeighbors classifier on train set: {:.2f}'.format(kc.score(X_train, y_train)))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the most recent `y_pred`
# against the held-out labels.
report = classification_report(y_test, y_pred)
print(report)