# Importing libraries

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [13]:
# Load the placement dataset from the CSV under ../input.
df = pd.read_csv("../input/engineering-placements-prediction/collegePlace.csv")

# A cell only renders its final bare expression, so the head preview has to be
# displayed explicitly; the tail preview is rendered as the cell result.
display(df.head())
df.tail()

How many male and female students have backlogs?

How many students are in each stream?

How many students have a CGPA above 8?

How many students are hostellers and how many are day scholars?

How many students have done internships?



In [14]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

In [15]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [16]:
# Number of rows for each distinct Gender value.
df['Gender'].value_counts()

In [17]:
import plotly.express as px
from plotly.subplots import make_subplots

# Bar chart with one bar per Gender value; bar height is the number of rows.
px.histogram(
    data_frame=df,
    x="Gender",
    title="<b>Total Male and Female</b>",
)

In [33]:
# Donut chart (hole=0.5) showing the share of each Gender value.
fig = px.pie(
    data_frame=df,
    names="Gender",
    title="<b>Counts in Gender</b>",
    hole=0.5,
)

# Put percentage + label inside each slice and outline the slices in black.
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    marker={"line": {"color": '#000000', "width": 1.5}},
)

fig.show()

# Number of students with a history of backlogs

In [19]:
# Distribution of the HistoryOfBacklogs flag. The table is shown through
# display() because a cell only renders its last bare expression, and this
# cell ends with print().
display(df['HistoryOfBacklogs'].value_counts())

# Count the rows flagged 1 with a boolean sum rather than value_counts()[1],
# which raises KeyError when no row carries that value.
print("Number of students having backlog in past are: ", (df['HistoryOfBacklogs'] == 1).sum())

# Number of students who are hostellers or day scholars

In [20]:
# Distribution of the Hostel flag. The table is shown through display()
# because a cell only renders its last bare expression, and this cell ends
# with print().
display(df['Hostel'].value_counts())

# Boolean sums give 0 for a value that never occurs, whereas
# value_counts()[label] would raise KeyError for a missing label.
print("Number of students who stay in hostel are : ", (df['Hostel'] == 1).sum())
print("Number of students who are day scholar's : ", (df['Hostel'] == 0).sum())

# Average age of students

In [30]:
# Age distribution with a dashed red vertical marker at the mean age.
fig = px.histogram(
    data_frame=df,
    x='Age',
    title="<b>Average Age of Student</b>",
)
fig.add_vline(
    x=df['Age'].mean(),
    line_dash="dash",
    line_color="red",
    line_width=2,
)
fig.show()

In [35]:
# Split the students by gender.
male = df.loc[df['Gender'].eq("Male")]
female = df.loc[df['Gender'].eq("Female")]

# Head counts per gender, and how many of each have PlacedOrNot == 1.
total_male, total_female = len(male), len(female)
total_male_pass = len(male.loc[male['PlacedOrNot'].eq(1)])
total_female_pass = len(female.loc[female['PlacedOrNot'].eq(1)])

# Share of placed students within each gender, as a percentage rounded to 2 decimals.
pass_male_percentage = np.round(100 * total_male_pass / total_male, 2)
pass_female_percentage = np.round(100 * total_female_pass / total_female, 2)

# Each value is wrapped in a one-element list so the dict can be turned into a
# single-row DataFrame later.
details = {
    label: [value]
    for label, value in (
        ("Total Male", total_male),
        ("Total Female", total_female),
        ("Total male pass", total_male_pass),
        ("Total female pass", total_female_pass),
        ("% of Passed Male", pass_male_percentage),
        ("% of Passed Female", pass_female_percentage),
    )
}
details

In [36]:
# Turn the summary dict into a one-row frame, then transpose it so that each
# statistic is shown on its own row.
gender_wise = pd.DataFrame(data=details, index=["Detail"])
gender_wise.transpose()

In [43]:
# Students per Stream, with the bars coloured by placement outcome.
fig = px.histogram(df, x="Stream", color="PlacedOrNot", title="<b>Counts of Stream</b>")
fig.show()

In [22]:
# Boolean mask selecting the rows whose HistoryOfBacklogs flag is 1.
backlog = df['HistoryOfBacklogs'].eq(1)
type(backlog)

In [23]:
# Per-gender non-null counts of every column, restricted to the backlog rows.
df.loc[backlog].groupby('Gender').count()

In [24]:
# Non-null counts of every column per placement outcome, restricted to the backlog rows.
df.loc[backlog].groupby('PlacedOrNot').count()

In [44]:
# Per-gender summary: mean age, total internships, mean CGPA, number placed.
# NOTE: the variable is called stream_wise although the grouping key here is
# Gender; the name is kept unchanged so the notebook's namespace stays the same.
stream_wise = df.groupby('Gender').agg(
    Age=('Age', 'mean'),
    Internships=('Internships', 'sum'),
    CGPA=('CGPA', 'mean'),
    PlacedOrNot=('PlacedOrNot', 'sum'),
)

# Highlight the maximum of every column in the rendered table.
stream_wise.style.highlight_max()

In [47]:
# Per-stream summary: mean age, total internships, mean CGPA, number placed.
stream_wise = df.groupby('Stream').agg(
    Age=('Age', 'mean'),
    Internships=('Internships', 'sum'),
    CGPA=('CGPA', 'mean'),
    PlacedOrNot=('PlacedOrNot', 'sum'),
)

# Highlight the maximum of every column in the rendered table.
stream_wise.style.highlight_max()

In [54]:
# Subset of the students whose Internships value is 0.
no_internship = df.loc[df['Internships'].eq(0)]

no_internship

In [55]:
# Placement outcome among the students with zero internships.
fig = px.histogram(
    no_internship,
    x="PlacedOrNot",
    color="PlacedOrNot",
    title="<b>No Internship Experience Vs Placement</b>",
)
fig.update_layout(bargap=0.2)  # leave a visible gap between the bars
fig.show()

# One Hot Encoding

In [57]:
# One-hot encode the two categorical columns: one indicator column per category.
dummy_gender, dummy_stream = (
    pd.get_dummies(df[column]) for column in ('Gender', 'Stream')
)

In [58]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
data = pd.concat(
    [df.drop(columns=["Gender", "Stream"]), dummy_gender, dummy_stream],
    axis="columns",
)

data

# Scaling features

In [61]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from datetime import datetime

# plot_confusion_matrix was removed from scikit-learn in release 1.2
# (ConfusionMatrixDisplay.from_estimator is its replacement). Importing it
# unconditionally made this whole cell fail with ImportError on current
# versions even though nothing in the notebook calls it, so the import is
# tolerant and the name falls back to None when it is unavailable.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Arranging the data

In [64]:
# Reorder the columns so that the PlacedOrNot target ends up last.
data = data.loc[:, [
    'Age',
    'Male', 'Female',
    'Electronics And Communication', 'Computer Science',
    'Information Technology', 'Mechanical', 'Electrical', "Civil",
    "Internships", "CGPA", 'Hostel', 'HistoryOfBacklogs',
    'PlacedOrNot',
]]

data

In [65]:
# Fit a StandardScaler on every feature column (everything except the
# PlacedOrNot target) and transform those same columns in one step.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop(columns='PlacedOrNot'))

In [66]:
# Wrap the scaled array in a DataFrame, reusing the feature column names
# (every column of `data` except the trailing target column).
scaled_features = pd.DataFrame(
    data=scaled_features,
    columns=data.columns[:-1],
)
scaled_features.head()

# Visualize the correlation of the independent features with the dependent feature

In [71]:
# Pairwise correlation matrix over the columns of the encoded frame.
corrmat = data.corr()
top_corr_features = corrmat.index

plt.figure(figsize=(20, 15))

# Annotated heat map of the correlations between those columns.
g = sns.heatmap(
    data.loc[:, top_corr_features].corr(),
    cmap="RdYlGn",
    annot=True,
)

Internships and CGPA are highly correlated with the dependent feature, i.e. PlacedOrNot.

🧾 Train test split

In [91]:
# Separate the feature matrix (X) from the target column (Y).
X = data.drop(columns="PlacedOrNot")
Y = data.loc[:, "PlacedOrNot"]

In [92]:
# train_test_split is not imported by any earlier cell, so this call raised
# NameError on a fresh kernel; import it right next to its use.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the shuffle.
X_train_full, X_valid_full, Y_train, Y_valid = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=0)

In [93]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

# Baseline gradient-boosted classifier.
# eval_metric is configured on the estimator instead of being passed to fit():
# XGBoost deprecated the fit() keyword in 1.6 and later removed it, so the
# original fit(..., eval_metric=...) call fails on current releases.
# NOTE(review): use_label_encoder is kept as in the original; recent XGBoost
# releases appear to ignore it - confirm and drop it if so.
clf = XGBClassifier(learning_rate=0.09,
                    n_estimators=100,
                    use_label_encoder=False,
                    eval_metric='logloss',
                    random_state=42)

clf.fit(X_train_full, Y_train)

# Class predictions on the held-out split, scored by accuracy.
predictions = clf.predict(X_valid_full)
print("accuracy_score: " + str(accuracy_score(Y_valid, predictions)))

In [94]:
# F1 score of the baseline predictions on the held-out split.
print(f"f1_score: {f1_score(Y_valid, predictions)}")


In [95]:
#Randomized Search CV

# Number of trees in random forest
# Candidate numbers of boosting rounds: 12 evenly spaced values from 100 to 1200.
n_estimators = np.linspace(start=100, stop=1200, num=12).astype(int).tolist()
# Candidate learning rates.
learning_rate = list(map(float, (0.9, 0.1, 0.09, 0.01)))

In [96]:
# Create the random grid
# Parameter distributions handed to the randomized search.
random_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

print(random_grid)

In [97]:
# Default-configured XGBClassifier used as the base estimator for the hyper-parameter search.
clf2 = XGBClassifier()

In [98]:
# Randomized hyper-parameter search: 10 sampled candidates, 5-fold
# cross-validation, run in a single process.
model = RandomizedSearchCV(
    estimator=clf2,
    param_distributions=random_grid,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [99]:
# Run the randomized search on the training split.
# eval_metric is no longer forwarded through fit(): XGBoost deprecated that
# fit() keyword in 1.6 and later removed it, and because no eval_set is
# supplied the metric would never be evaluated during training anyway.
model.fit(X_train_full, Y_train)

In [100]:
# Predict on the held-out split with the fitted search object and report accuracy.
predictions2 = model.predict(X_valid_full)
print(f"accuracy_score: {accuracy_score(Y_valid, predictions2)}")

# Making Predictions
### The section for realtime input

In [114]:

# NOTE(review): this interactive prediction cell is disabled (every line is
# commented out). Before re-enabling it, confirm the following:
#   * the array handed to clf.predict holds 7 values with a single
#     integer-coded stream, whereas the training frame assembled earlier in
#     this notebook has 13 feature columns with one-hot Gender/Stream columns;
#   * the final else branch prints the same "high chances" wording as the
#     positive branch, which looks unintended;
#   * CGPA is parsed with int(), so a fractional CGPA string would be rejected.
# print("Enter Age : ")
# age=int(input())
# print("\nEnter Gender(M/F) : ")
# gen=input()
# if(gen=='M'):
#     gender=1
#     gender=int(gender)
# elif(gen=='F'):
#     gender=0
#     gender=int(gender)
# print("\nEnter Stream(ECE/CS/IT/MEC/ELE/CIV) : ")
# stre=input()
# if(stre=='ECE'):
#     stream=3
# elif(stre=='CS'):
#     stream=1
# elif(stre=='IT'):
#     stream=4
# elif(stre=='MEC'):
#     stream=5
# elif(stre=='ELE'):
#     stream=2
# elif(stre=='CIV'):
#     stream=0
# stream=int(stream)
# print("\nEnter Number of Internships Done : ")
# intern = int(input())
# print("\nEnter CGPA : ")
# cgpa=int(input())
# print("\nEnter Hostel Accommodation : ")
# hostel=int(input())
# print("\nEnter number of backlogs : ")
# back=int(input())
# pred = clf.predict(np.array([[age,gender,stream,intern,cgpa,hostel,back]]))
# if(pred==1):
#     print("\nYou have high chances of getting placed")
# else:
#     print("\nYou have high chances of getting placed. Work more Hard!")