In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the preprocessed per-company specification table from the working directory.
data = pd.read_csv(filepath_or_buffer="preprocessed_spec.csv")

In [None]:
# The applicant's own specification, keyed by feature name.
user_spec = {
    '평균 학점': 3.72,
    '평균 토익점수': 833,
    '평균 외국어 개수': 1.2,
    '평균 자격증 개수': 2.2,
    '평균 해외경험 횟수': 1.3,
    '평균 인턴경험 횟수': 1.3,
    '평균 수상 횟수': 2.2,
    '평균 봉사 횟수': 2,
}

# Wrap the single record in a list to obtain a one-row frame (row label 0).
user_spec_df = pd.DataFrame([user_spec])

In [None]:
# Keep every column except the company-name column ('기업명').
# Note: at this point `data_scaled` still holds the unscaled values.
data_scaled = data.drop(columns='기업명')

In [None]:
# Scaling the data using Min-Max Scaler.
#
# The scaler is always fitted on the raw feature columns taken from `data`,
# not on `data_scaled`. This cell overwrites `data_scaled`, so fitting on it
# would make the cell non-idempotent: on a second run the scaler would be
# fitted on values already in [0, 1], become an identity transform, and leave
# `user_spec_scaled` unscaled.
features_raw = data.drop(columns='기업명')

scaler = MinMaxScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(features_raw),
    columns=features_raw.columns,
)

# Apply the scaling learned from the dataset to the user's row.
user_spec_scaled = scaler.transform(user_spec_df)

In [None]:
# Cosine similarity of each scaled dataset row against the scaled user row.
cosine_sim = cosine_similarity(X=data_scaled, Y=user_spec_scaled)

In [None]:
# Flatten the similarity matrix into a Series; its labels 0..n-1 are the
# row positions of the dataset.
sim_scores = pd.Series(cosine_sim.flatten())

# Highest similarity first.
sim_scores_sorted = sim_scores.sort_values(ascending=False)

# Row positions of the five best-matching entries.
top_indexes = sim_scores_sorted.head(5).index

# Look the company names up by position in the original frame.
top_companies = data.loc[:, '기업명'].iloc[top_indexes]

print(top_companies)


0        삼성전자
275     웅진씽크빅
2925    한국마사회
764      한솔교육
48      롯데케미칼
Name: 기업명, dtype: object
