In [1]:
import pandas as pd

students = pd.read_csv("/content/pm_internship_students_final.csv")
internships = pd.read_csv("/content/pm_internship_internships_mergedV1.csv")
allocations = pd.read_csv("/content/pm_internship_allocations.csv")

print("Students Columns:\n", students.columns)
print("\nInternships Columns:\n", internships.columns)
print("\nAllocations Columns:\n", allocations.columns)

print("\nMissing values (students):\n", students.isnull().sum())
print("\nMissing values (internships):\n", internships.isnull().sum())
print("\nMissing values (allocations):\n", allocations.isnull().sum())


Students Columns:
 Index(['Student_ID', 'Name', 'Gender', 'Age', 'Education_Level', 'Discipline',
       'Skills', 'Preferred_Location', 'Fallback_City', 'Sector_Interest',
       'Category', 'District_Type', 'CGPA', 'Percentage', 'Availability',
       'Preferred_Mode', 'Languages_Known', 'Experience_Level',
       'Project_Experience', 'Num_Projects', 'Aptitude_Score',
       'Soft_Skills_Rating', 'Relocation_Willingness', 'Smart_City_Preference',
       'Smart_Domain_Interest'],
      dtype='object')

Internships Columns:
 Index(['Internship_ID', 'Company_Name', 'Sector', 'Smart_City_Assigned',
       'Fallback_City', 'Domain', 'Required_Skills', 'Required_Language',
       'Location', 'Capacity', 'Reserved_Seats', 'Rural_Urban_Distribution',
       'Duration', 'Stipend', 'Mode', 'Type', 'Minimum_Qualification',
       'Min_CGPA', 'Eligibility_Stream', 'Application_Deadline'],
      dtype='object')

Allocations Columns:
 Index(['Student_ID', 'Name', 'Category', 'CGPA', 'Allocated_In

In [2]:
# Filter Students
students_clean = students[[
    "Student_ID", "Skills", "Discipline", "Education_Level",
    "Preferred_Location", "Category", "CGPA",
    "Experience_Level", "Project_Experience", "Num_Projects"
]]

# Filter Internships
intern_clean = internships[[
    "Internship_ID", "Required_Skills", "Sector", "Domain",
    "Location", "Mode", "Duration", "Stipend", "Capacity"
]]

# Filter Allocations
alloc_clean = allocations[[
    "Student_ID", "Allocated_Internship", "CGPA", "Category"
]]

students_clean.head(), intern_clean.head(), alloc_clean.head()


(  Student_ID                                    Skills   Discipline  \
 0     S00001                            Blockchain, ML  Engineering   
 1     S00002  App Dev, ML, Cybersecurity, Data Science     Commerce   
 2     S00003                                       IoT  Engineering   
 3     S00004              Cloud, Finance, Data Science      Science   
 4     S00005                              Data Science      Science   
 
   Education_Level Preferred_Location Category  CGPA Experience_Level  \
 0              PG              Noida      GEN  8.60         2+ years   
 1              UG           Durgapur       ST  9.43        1-2 years   
 2              UG             Jaipur       SC  6.42        1-2 years   
 3              UG              Delhi      OBC  8.12        1-2 years   
 4              PG             Jaipur      OBC  9.59         2+ years   
 
   Project_Experience  Num_Projects  
 0                 No             0  
 1                 No             2  
 2          

In [3]:
def clean_skill_string(s):
    s = s.lower()
    parts = [x.strip().replace(" ", "_") for x in s.split(",")]
    return parts

students_clean["Skills_List"] = students_clean["Skills"].apply(clean_skill_string)
intern_clean["Required_Skills_List"] = intern_clean["Required_Skills"].apply(clean_skill_string)

students_clean[["Skills", "Skills_List"]].head(), intern_clean[["Required_Skills", "Required_Skills_List"]].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  students_clean["Skills_List"] = students_clean["Skills"].apply(clean_skill_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intern_clean["Required_Skills_List"] = intern_clean["Required_Skills"].apply(clean_skill_string)


(                                     Skills  \
 0                            Blockchain, ML   
 1  App Dev, ML, Cybersecurity, Data Science   
 2                                       IoT   
 3              Cloud, Finance, Data Science   
 4                              Data Science   
 
                                   Skills_List  
 0                            [blockchain, ml]  
 1  [app_dev, ml, cybersecurity, data_science]  
 2                                       [iot]  
 3              [cloud, finance, data_science]  
 4                              [data_science]  ,
                           Required_Skills  \
 0    SQL, Cybersecurity, Finance, App Dev   
 1                                      AI   
 2  Data Science, Cybersecurity, Cloud, AI   
 3           Cybersecurity, Cloud, C++, ML   
 4                  ML, SQL, Cybersecurity   
 
                        Required_Skills_List  
 0    [sql, cybersecurity, finance, app_dev]  
 1                                      [ai

In [4]:
# Collect all unique skills from both students and internships
all_skills = set()

for skills in students_clean["Skills_List"]:
    all_skills.update(skills)

for skills in intern_clean["Required_Skills_List"]:
    all_skills.update(skills)

# Create vocabulary
skill_to_id = {skill: idx + 1 for idx, skill in enumerate(sorted(all_skills))}
id_to_skill = {idx: skill for skill, idx in skill_to_id.items()}

len(skill_to_id), list(skill_to_id.items())[:20]


(16,
 [('ai', 1),
  ('app_dev', 2),
  ('blockchain', 3),
  ('c++', 4),
  ('cloud', 5),
  ('cybersecurity', 6),
  ('data_science', 7),
  ('finance', 8),
  ('iot', 9),
  ('java', 10),
  ('marketing', 11),
  ('ml', 12),
  ('public_policy', 13),
  ('python', 14),
  ('sql', 15),
  ('web_dev', 16)])

In [5]:
# Convert skill list to sequence of IDs
def skills_to_ids(skill_list):
    return [skill_to_id[s] for s in skill_list if s in skill_to_id]

students_clean["Skill_Seq"] = students_clean["Skills_List"].apply(skills_to_ids)
intern_clean["ReqSkill_Seq"] = intern_clean["Required_Skills_List"].apply(skills_to_ids)

students_clean[["Skills_List", "Skill_Seq"]].head(), intern_clean[["Required_Skills_List", "ReqSkill_Seq"]].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  students_clean["Skill_Seq"] = students_clean["Skills_List"].apply(skills_to_ids)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intern_clean["ReqSkill_Seq"] = intern_clean["Required_Skills_List"].apply(skills_to_ids)


(                                  Skills_List      Skill_Seq
 0                            [blockchain, ml]        [3, 12]
 1  [app_dev, ml, cybersecurity, data_science]  [2, 12, 6, 7]
 2                                       [iot]            [9]
 3              [cloud, finance, data_science]      [5, 8, 7]
 4                              [data_science]            [7],
                        Required_Skills_List   ReqSkill_Seq
 0    [sql, cybersecurity, finance, app_dev]  [15, 6, 8, 2]
 1                                      [ai]            [1]
 2  [data_science, cybersecurity, cloud, ai]   [7, 6, 5, 1]
 3           [cybersecurity, cloud, c++, ml]  [6, 5, 4, 12]
 4                  [ml, sql, cybersecurity]    [12, 15, 6])

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Combine all lengths to find maximum
all_lengths = [len(seq) for seq in students_clean["Skill_Seq"]] + \
              [len(seq) for seq in intern_clean["ReqSkill_Seq"]]

max_len = max(all_lengths)
max_len


4

In [7]:
students_clean["Skill_Seq_Padded"] = list(pad_sequences(students_clean["Skill_Seq"], maxlen=max_len, padding='post'))
intern_clean["ReqSkill_Seq_Padded"] = list(pad_sequences(intern_clean["ReqSkill_Seq"], maxlen=max_len, padding='post'))

students_clean[["Skill_Seq", "Skill_Seq_Padded"]].head(), intern_clean[["ReqSkill_Seq", "ReqSkill_Seq_Padded"]].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  students_clean["Skill_Seq_Padded"] = list(pad_sequences(students_clean["Skill_Seq"], maxlen=max_len, padding='post'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intern_clean["ReqSkill_Seq_Padded"] = list(pad_sequences(intern_clean["ReqSkill_Seq"], maxlen=max_len, padding='post'))


(       Skill_Seq Skill_Seq_Padded
 0        [3, 12]    [3, 12, 0, 0]
 1  [2, 12, 6, 7]    [2, 12, 6, 7]
 2            [9]     [9, 0, 0, 0]
 3      [5, 8, 7]     [5, 8, 7, 0]
 4            [7]     [7, 0, 0, 0],
     ReqSkill_Seq ReqSkill_Seq_Padded
 0  [15, 6, 8, 2]       [15, 6, 8, 2]
 1            [1]        [1, 0, 0, 0]
 2   [7, 6, 5, 1]        [7, 6, 5, 1]
 3  [6, 5, 4, 12]       [6, 5, 4, 12]
 4    [12, 15, 6]      [12, 15, 6, 0])

In [8]:
# Merge allocation with student + internship embeddings
positive_pairs = alloc_clean.merge(students_clean[["Student_ID", "Skill_Seq_Padded"]], on="Student_ID")
positive_pairs = positive_pairs.merge(intern_clean[["Internship_ID", "ReqSkill_Seq_Padded"]],
                                      left_on="Allocated_Internship",
                                      right_on="Internship_ID")

positive_pairs["label"] = 1
positive_pairs.head()


Unnamed: 0,Student_ID,Allocated_Internship,CGPA,Category,Skill_Seq_Padded,Internship_ID,ReqSkill_Seq_Padded,label
0,S00004,I0032,8.12,OBC,"[5, 8, 7, 0]",I0032,"[16, 1, 0, 0]",1
1,S00005,I0024,9.59,OBC,"[7, 0, 0, 0]",I0024,"[9, 0, 0, 0]",1
2,S00016,I0027,8.63,SC,"[7, 1, 0, 0]",I0027,"[12, 10, 14, 15]",1
3,S00023,I0014,7.81,GEN,"[15, 11, 10, 0]",I0014,"[14, 0, 0, 0]",1
4,S00030,I0020,5.41,ST,"[12, 6, 2, 0]",I0020,"[7, 16, 0, 0]",1


In [9]:
import random
import pandas as pd

negative_samples = []

student_ids = students_clean["Student_ID"].tolist()
internship_ids = intern_clean["Internship_ID"].tolist()

# same number of negatives as positives
for i in range(len(positive_pairs)):
    s = random.choice(student_ids)
    j = random.choice(internship_ids)
    negative_samples.append([s, j, 0])  # label 0

negative_df = pd.DataFrame(negative_samples, columns=["Student_ID", "Internship_ID", "label"])


In [10]:
negative_df.head()


Unnamed: 0,Student_ID,Internship_ID,label
0,S04154,I1855,0
1,S01687,I0804,0
2,S04120,I1503,0
3,S02900,I0181,0
4,S04215,I2323,0


In [11]:
# attach student skill sequences
negative_df = negative_df.merge(
    students_clean[["Student_ID", "Skill_Seq_Padded"]],
    on="Student_ID",
    how="left"
)

# attach internship required skill sequences
negative_df = negative_df.merge(
    intern_clean[["Internship_ID", "ReqSkill_Seq_Padded"]],
    on="Internship_ID",
    how="left"
)

negative_df.head()


Unnamed: 0,Student_ID,Internship_ID,label,Skill_Seq_Padded,ReqSkill_Seq_Padded
0,S04154,I1855,0,"[3, 14, 4, 0]","[6, 2, 0, 0]"
1,S01687,I0804,0,"[1, 6, 7, 0]","[5, 9, 0, 0]"
2,S04120,I1503,0,"[2, 4, 0, 0]","[7, 11, 0, 0]"
3,S02900,I0181,0,"[6, 12, 8, 0]","[4, 16, 2, 14]"
4,S04215,I2323,0,"[16, 3, 10, 1]","[2, 16, 13, 12]"


In [12]:
train_df = pd.concat(
    [
        positive_pairs[["Skill_Seq_Padded", "ReqSkill_Seq_Padded", "label"]],
        negative_df[["Skill_Seq_Padded", "ReqSkill_Seq_Padded", "label"]]
    ],
    ignore_index=True
)

train_df.head(), train_df.shape


(  Skill_Seq_Padded ReqSkill_Seq_Padded  label
 0     [5, 8, 7, 0]       [16, 1, 0, 0]      1
 1     [7, 0, 0, 0]        [9, 0, 0, 0]      1
 2     [7, 1, 0, 0]    [12, 10, 14, 15]      1
 3  [15, 11, 10, 0]       [14, 0, 0, 0]      1
 4    [12, 6, 2, 0]       [7, 16, 0, 0]      1,
 (2154, 3))

In [13]:
import numpy as np
from sklearn.model_selection import train_test_split

X_student = np.array(train_df["Skill_Seq_Padded"].tolist())
X_intern  = np.array(train_df["ReqSkill_Seq_Padded"].tolist())
y         = np.array(train_df["label"].tolist())

Xs_train, Xs_val, Xi_train, Xi_val, y_train, y_val = train_test_split(
    X_student, X_intern, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

Xs_train.shape, Xi_train.shape, y_train.shape, Xs_val.shape, Xi_val.shape, y_val.shape


((1723, 4), (1723, 4), (1723,), (431, 4), (431, 4), (431,))

In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model

vocab_size = len(skill_to_id) + 1   # +1 for padding (0)
seq_len = X_student.shape[1]        # padded length

# Student input
student_input = Input(shape=(seq_len,))
student_embed = Embedding(input_dim=vocab_size, output_dim=32, mask_zero=True)(student_input)
student_lstm = LSTM(32)(student_embed)

# Internship input
intern_input = Input(shape=(seq_len,))
intern_embed = Embedding(input_dim=vocab_size, output_dim=32, mask_zero=True)(intern_input)
intern_lstm = LSTM(32)(intern_embed)

# Combine
combined = Concatenate()([student_lstm, intern_lstm])
dense1 = Dense(32, activation='relu')(combined)
output = Dense(1, activation='sigmoid')(dense1)

model = Model(inputs=[student_input, intern_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


In [15]:
history = model.fit(
    [Xs_train, Xi_train],
    y_train,
    validation_data=([Xs_val, Xi_val], y_val),
    epochs=10,
    batch_size=32
)


Epoch 1/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 22ms/step - accuracy: 0.5291 - loss: 0.6921 - val_accuracy: 0.6589 - val_loss: 0.6844
Epoch 2/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7030 - loss: 0.6693 - val_accuracy: 0.7053 - val_loss: 0.6098
Epoch 3/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7369 - loss: 0.5589 - val_accuracy: 0.7309 - val_loss: 0.5167
Epoch 4/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7822 - loss: 0.4772 - val_accuracy: 0.7680 - val_loss: 0.4843
Epoch 5/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7797 - loss: 0.4671 - val_accuracy: 0.7657 - val_loss: 0.4573
Epoch 6/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7901 - loss: 0.4425 - val_accuracy: 0.8028 - val_loss: 0.4334
Epoch 7/10
[1m54/54[0m [32m━━━━

In [16]:
model.save("internship_recommender_lstm.h5")




In [17]:
model.save("internship_recommender.keras")


In [18]:
import numpy as np

def recommend_internships(student_id, top_k=5):
    # get student sequence
    student_row = students_clean[students_clean["Student_ID"] == student_id]

    if student_row.empty:
        return "Student not found."

    student_seq = np.array(student_row["Skill_Seq_Padded"].tolist() * len(intern_clean))

    # get all internship sequences
    intern_seq = np.array(intern_clean["ReqSkill_Seq_Padded"].tolist())

    # predict scores
    scores = model.predict([student_seq, intern_seq], verbose=0).flatten()

    # sort and pick top-k
    top_indices = scores.argsort()[::-1][:top_k]

    recommendations = []
    for idx in top_indices:
        rec = {
            "Internship_ID": intern_clean.iloc[idx]["Internship_ID"],
            "Required_Skills": intern_clean.iloc[idx]["Required_Skills_List"],
            "Score": float(scores[idx])
        }
        recommendations.append(rec)

    return recommendations


In [19]:
recommend_internships("S00004")


[{'Internship_ID': 'I1484',
  'Required_Skills': ['sql', 'ml', 'data_science', 'cybersecurity'],
  'Score': 0.9601829648017883},
 {'Internship_ID': 'I1933',
  'Required_Skills': ['marketing', 'ml', 'finance', 'cybersecurity'],
  'Score': 0.9469504356384277},
 {'Internship_ID': 'I2396',
  'Required_Skills': ['finance', 'ml', 'cloud', 'cybersecurity'],
  'Score': 0.9463256597518921},
 {'Internship_ID': 'I2556',
  'Required_Skills': ['marketing', 'cybersecurity', 'data_science', 'ml'],
  'Score': 0.9447588920593262},
 {'Internship_ID': 'I2618',
  'Required_Skills': ['ml', 'marketing', 'iot', 'cybersecurity'],
  'Score': 0.9414997696876526}]