<a href="https://colab.research.google.com/github/jarryMin/mot_jm/blob/main/AI_similar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [None]:
# Attach Google Drive to the Colab runtime; later cells read and write CSVs
# under /content/drive/MyDrive.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#!unzip data/open.zip -d data/
#!pip install transformers datasets
# https://wikidocs.net/166806

Load Train / Test dataset

In [None]:
# Read the training CSV from the mounted Drive folder and preview its first rows.
train = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/similar/sample_train.csv")
train.head(n=5)

In [None]:
# Read the test CSV from the mounted Drive folder and preview its first rows.
test = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/similar/test.csv")
test.head(n=5)

Define Model (CountVectorizer+CosineSimilarity)

In [None]:
class BaselineModel:
    """Token-count baseline that scores how similar two code snippets are.

    Every snippet is converted to a token-count vector by one shared
    ``CountVectorizer``. The score of a pair is the cosine similarity of its
    two vectors, and a pair is labelled similar (1) when that score is
    strictly greater than ``threshold``.

    Args:
        threshold: Similarity cut-off used by ``predict``. Defaults to 0.5.
    """

    def __init__(self, threshold=0.5):
        self.threshold = threshold  # similarity cut-off applied in predict()
        self.vectorizer = CountVectorizer()

    def fit(self, code1, code2):
        """Learn a single vocabulary from both sides of the code pairs.

        Args:
            code1: Iterable of source-code strings (first element of each pair).
            code2: Iterable of source-code strings (second element of each pair).
        """
        # CountVectorizer.fit() rebuilds the vocabulary from scratch on every
        # call, so fitting on code1 and then on code2 would keep only code2's
        # tokens. Fit once on the combined corpus instead.
        corpus = list(code1) + list(code2)
        self.vectorizer.fit(corpus)
        print('Done.')

    @staticmethod
    def _row_sums(matrix):
        """Return the per-row sums of a sparse matrix as a flat float array."""
        return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

    def predict_proba(self, code1, code2):
        """Return the cosine similarity of every (code1[i], code2[i]) pair.

        Args:
            code1: Iterable of source-code strings.
            code2: Iterable of source-code strings, aligned with ``code1``.

        Returns:
            1-D float ``numpy.ndarray`` with one similarity score per pair.
            A pair in which either snippet has no known token scores 0.

        Raises:
            ValueError: If ``code1`` and ``code2`` contain a different number
                of snippets.
        """
        # Vectorize both sides with the fitted vocabulary.
        code1_vecs = self.vectorizer.transform(code1)
        code2_vecs = self.vectorizer.transform(code2)

        if code1_vecs.shape[0] != code2_vecs.shape[0]:
            raise ValueError(
                'code1 and code2 must contain the same number of snippets, '
                'got %d and %d' % (code1_vecs.shape[0], code2_vecs.shape[0])
            )

        # Row-wise cosine similarity computed on the sparse matrices in one
        # pass, rather than one cosine_similarity() call per pair.
        dots = self._row_sums(code1_vecs.multiply(code2_vecs))
        norms1 = np.sqrt(self._row_sums(code1_vecs.multiply(code1_vecs)))
        norms2 = np.sqrt(self._row_sums(code2_vecs.multiply(code2_vecs)))
        denominators = norms1 * norms2

        # All-zero vectors would divide by zero; leave their similarity at 0.
        preds = np.zeros(dots.shape, dtype=float)
        np.divide(dots, denominators, out=preds, where=denominators > 0)

        print('Done.')
        # Similarity score of each code pair.
        return preds

    def predict(self, code1, code2):
        """Classify each code pair as similar (1) or not similar (0).

        Args:
            code1: Iterable of source-code strings.
            code2: Iterable of source-code strings, aligned with ``code1``.

        Returns:
            1-D integer ``numpy.ndarray``: 1 where the cosine similarity is
            strictly greater than ``self.threshold``, otherwise 0.
        """
        preds = self.predict_proba(code1, code2)
        # Binarize the similarity scores with the configured threshold.
        preds = np.where(preds > self.threshold, 1, 0)
        return preds

Model(Vectorizer) Fit

In [None]:
# Create the baseline model with a similarity threshold of 0.5.
model = BaselineModel(threshold=0.5)
# Fit the model on the training code pairs.
model.fit(code1=train['code1'], code2=train['code2'])

Inference

In [None]:
# Run inference: one 0/1 similarity label per test code pair.
preds = model.predict(code1=test['code1'], code2=test['code2'])

Submission

In [8]:
# Load the submission template, fill its 'similar' column with the predictions,
# and write the result next to the other competition files on Drive.
submission = (
    pd.read_csv('/content/drive/MyDrive/similar/sample_submission.csv')
    .assign(similar=preds)
)
submission.to_csv('/content/drive/MyDrive/similar/submission.csv', index=False)