# 데이터 불러오기 + 전처리

In [None]:
# Load the syllabus data: list every entry in the shared-drive data folder
import os

# NOTE(review): hard-coded Google Drive path — presumably requires the drive to be mounted first; confirm
path = '/content/drive/Shareddrives/NLP모델링/강의계획서/강의계획서 데이터'
# os.listdir returns the entry names in the folder (in arbitrary order)
file_list = os.listdir(path)

In [None]:
# 전체 데이터를 한 데이터프레임으로 합치기
import pandas as pd
import numpy as np

# Read every syllabus file and stack them in a single concat call.
# Concatenating inside the loop re-copies the growing frame on each iteration
# and starts from an empty DataFrame, which recent pandas versions warn about.
df = pd.concat([pd.read_csv(os.path.join(path, f)) for f in file_list])

# Drop the saved index column and the original overview column, then keep
# only the columns used downstream (in this order).
df = df.drop(columns=["Unnamed: 0", "강의개요"])
df = df.reindex(columns=["대분류", "학정번호", "강의명", "교수명", "강의개요한국어", "유의사항"])
# The Korean overview column becomes the canonical "강의개요" column.
df = df.rename(columns={"강의개요한국어": "강의개요"})
# Index rows by course code.
df = df.set_index("학정번호")

df.shape

(2593, 5)

In [None]:
# Preview the first rows of the merged syllabus table
df.head()

Unnamed: 0_level_0,대분류,강의명,교수명,강의개요,유의사항
학정번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YCA1007-01-00,교양기초,채플(C),"이대성,정종훈,정미현",,"100%온라인:동영상콘텐츠강의, 수강대상: 2015학번 이후, P/NP평가 과목"
YCA1009-01-00,교양기초,예배채플(A),"정종훈,이대성,정미현",채플 담당 목사,"대면강의, 수강대상: 2015학번 이후, P/NP평가 과목"
YCA1101-12-00,교양기초,기독교와세계문화,김주환,"본 강의는 크게 ‘성서’, ‘기독교’, ‘문화’라는 세 가지 주제를 중심으로 이루어...","100%온라인:동영상-실시간혼합강의, 절대평가"
YCA1102-14-00,교양기초,기독교와현대사회,조미영,"기독교 정신을 소개함으로써 ‘진리와 자유’라는 본교의 건학 이념을 이해하고, 기독교...","100%온라인:실시간온라인강의, 절대평가"
YCA1103-08-00,교양기초,성서와기독교,정미현,1. 연세대학교의 건학 정신에 깃든 성서와 기독교 사상의 핵심가치를 이해한다. 2....,"블랜딩:대면-비대면(동영상콘텐츠), 절대평가, 사회혁신가 인증 교과목"


# 형태소 분석

In [None]:
!pip install konlpy     # KoNLPy: package for Korean-language text processing



In [None]:
# 형태소 분석기로 명사/형용사/동사만 추출한 문서 만들기
from konlpy.tag import Okt  # 트위터 형태소 분석기를 계승하는 프로젝트
okt = Okt()

outline_list = df.loc[:, '강의개요'].to_list()  # course-overview column as a plain list
doc_list = []   # per course: (token, POS tag) pairs for every token
word_list = []  # per course: space-joined nouns/adjectives/verbs only

for outline in outline_list:
    # A missing overview is NaN; str(NaN) would be tagged as the literal
    # word "nan", so missing overviews yield an empty token list instead.
    tokenized_doc = [] if pd.isna(outline) else okt.pos(str(outline))
    tokenized_word = ' '.join(
        word for word, tag in tokenized_doc if tag in ("Noun", "Adjective", "Verb")
    )
    doc_list.append(tokenized_doc)
    word_list.append(tokenized_word)

In [None]:
# tokenized_doc: one row per course holding its (token, POS tag) list,
# labelled with the same course-code index as df
df_doc = pd.DataFrame({"tokenized_doc": doc_list}, index=df.index)
df_doc.head()

Unnamed: 0_level_0,tokenized_doc
학정번호,Unnamed: 1_level_1
YCA1007-01-00,"[(nan, Alpha)]"
YCA1009-01-00,"[(채플, Noun), (담당, Noun), (목사, Noun)]"
YCA1101-12-00,"[(본, Verb), (강의, Noun), (는, Josa), (크게, Noun),..."
YCA1102-14-00,"[(기독교, Noun), (정신, Noun), (을, Josa), (소개, Noun..."
YCA1103-08-00,"[(1, Number), (., Punctuation), (연세대학교, Noun),..."


In [None]:
# tokenized_word: one row per course holding its space-joined content words,
# labelled with the same course-code index as df
df_word = pd.DataFrame({"tokenized_word": word_list}, index=df.index)
df_word.head()

Unnamed: 0_level_0,tokenized_word
학정번호,Unnamed: 1_level_1
YCA1007-01-00,
YCA1009-01-00,채플 담당 목사
YCA1101-12-00,본 강의 크게 성서 기독교 문화 세 가지 주제 중심 이루어집니다 성서 기독교 종교 ...
YCA1102-14-00,기독교 정신 소개 함 진리 자유 본교 건학 이념 이해 기독교 사회 관계 대한 인문학...
YCA1103-08-00,연세대학교 건학 정신 깃 성서 기독교 사상 핵심 가치 이해 성서 기독교 사상 하나님...


# 최빈 단어 파악을 위한 Bag of Words

In [None]:
# Bag of Word
from konlpy.tag import Okt
okt = Okt()

def build_bag_of_words(document, tokenizer=None):
    """Build a bag-of-words representation of a single text.

    Args:
        document: Text to analyse. Full stops ('.') are removed before
            tokenizing.
        tokenizer: Optional callable that maps a string to an iterable of
            tokens. Defaults to the notebook-level ``okt.morphs``.

    Returns:
        A ``(word_to_index, bow)`` tuple. ``word_to_index`` maps each distinct
        token to an index assigned in order of first appearance, and
        ``bow[index]`` is the number of occurrences of that token.
    """
    from collections import Counter

    if tokenizer is None:
        # Resolved at call time so the notebook's global analyzer is used.
        tokenizer = okt.morphs

    # Counter keeps keys in first-seen order, which gives the index assignment.
    counts = Counter(tokenizer(document.replace('.', '')))
    word_to_index = {word: index for index, word in enumerate(counts)}
    bow = list(counts.values())

    return word_to_index, bow

In [None]:
# Bag of Words over the whole corpus: used to find the most frequent words
fullword = ' '.join(word_list)
doc = str(fullword)

vocab, bow = build_bag_of_words(doc)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

from pandas import Series, DataFrame

# One row per distinct token with its count, most frequent first
bow_df = DataFrame({"nouns": list(vocab), "bow": list(bow)}).sort_values("bow", ascending=False)
bow_df.head(20)

vocabulary : {'채플': 0, '담당': 1, '목사': 2, '본': 3, '강의': 4, '크게': 5, '성서': 6, '기독교': 7, '문화': 8, '세': 9, '가지': 10, '주제': 11, '중심': 12, '이루어집니다': 13, '종교': 14, '핵심': 15, '있고': 16, '우리': 17, '마주': 18, '하는': 19, '세상': 20, '다양한': 21, '속': 22, '녹아': 23, '들어가': 24, '있습니다': 25, '또한': 26, '만들어': 27, '온': 28, '생각': 29, '가까이': 30, '경험': 31, '되는': 32, '요소': 33, '합니다': 34, '기독교인': 35, '아니라고': 36, '하더라도': 37, '관용': 38, '표현': 39, '예술': 40, '한번': 41, '하게': 42, '됩니다': 43, '따라서': 44, '어떤': 45, '책': 46, '또': 47, '어떻게': 48, '해석': 49, '해': 50, '왔는지': 51, '형성': 52, '된': 53, '역사': 54, '영향': 55, '주었는지': 56, '살펴봄으로써': 57, '근간': 58, '가치관': 59, '재': 60, '발견': 61, '될': 62, '것': 63, '입니다': 64, '이를': 65, '통해': 66, '정신': 67, '설립': 68, '연세대학교': 69, '재학': 70, '중인': 71, '글로벌': 72, '사회': 73, '인류': 74, '보편': 75, '가치': 76, '추구': 77, '할': 78, '수': 79, '있을지': 80, '기회': 81, '가질': 82, '소개': 83, '함': 84, '진리': 85, '자유': 86, '본교': 87, '건학': 88, '이념': 89, '이해': 90, '관계': 91, '대한': 92, '인문학': 93, '탐색': 94, '비판': 95, '성찰': 96, '살아감

Unnamed: 0,nouns,bow
103,및,2719
19,하는,2383
90,이해,2036
63,것,1810
79,수,1787
92,대한,1738
78,할,1632
165,이,1541
145,과정,1403
131,한다,1391


# TF-IDF

## DTM
- DTM: Document-Term Matrix. BoW를 행렬 형태로 표현한 방법

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One space-joined token string per course
corpus = df_word["tokenized_word"].tolist()

vector = CountVectorizer()

# DTM: fit on the corpus and print the per-document term counts as a dense array
print(vector.fit_transform(corpus).toarray())

# Print the mapping from each term to its column index
print(vector.vocabulary_)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
{'채플': 7961, '담당': 1895, '목사': 2897, '강의': 229, '크게': 8353, '성서': 4376, '기독교': 1164, '문화': 3004, '가지': 112, '주제': 7526, '중심': 7588, '이루어집니다': 6344, '종교': 7428, '핵심': 9208, '있고': 6724, '우리': 5933, '마주': 2588, '하는': 8932, '세상': 4445, '다양한': 1803, '녹아': 1592, '들어가': 2333, '있습니다': 6752, '또한': 2422, '만들어': 2629, '생각': 4197, '가까이': 11, '경험': 514, '되는': 2202, '요소': 5893, '합니다': 9094, '기독교인': 1165, '아니라고': 5077, '하더라도': 8946, '관용': 756, '표현': 8787, '예술': 5722, '한번': 9057, '하게': 8911, '됩니다': 2271, '따라서': 2390, '어떤': 5364, '어떻게': 5367, '해석': 9168, '왔는지': 5836, '형성': 9324, '역사': 5553, '영향': 5701, '주었는지': 7513, '살펴봄으로써': 4097, '근간': 1074, '가치관': 131, '발견': 3256, '입니다': 6696, '이를': 6365, '통해': 8516, '정신': 7259, '설립': 4334, '연세대학교': 5609, '재학': 7005, '중인': 7616, '글로벌': 1095, '사회': 4030, '인류': 6526, '보편': 3596, '가치': 130, '추구': 8123, '있을지': 6774, '기회': 1252, '가질': 121, '소개': 4480, '

## TF-IDF 1) sklearn의 TfidfVectorizer 이용
   -> 나는 데이터프레임을 원한다 -_-

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One space-joined token string per course
corpus = df_word["tokenized_word"].tolist()

tfidfv = TfidfVectorizer()
tfidfv.fit(corpus)

# Print the TF-IDF matrix as a dense array
print(tfidfv.transform(corpus).toarray())

# Print the mapping from each term to its column index
print(tfidfv.vocabulary_)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
{'채플': 7961, '담당': 1895, '목사': 2897, '강의': 229, '크게': 8353, '성서': 4376, '기독교': 1164, '문화': 3004, '가지': 112, '주제': 7526, '중심': 7588, '이루어집니다': 6344, '종교': 7428, '핵심': 9208, '있고': 6724, '우리': 5933, '마주': 2588, '하는': 8932, '세상': 4445, '다양한': 1803, '녹아': 1592, '들어가': 2333, '있습니다': 6752, '또한': 2422, '만들어': 2629, '생각': 4197, '가까이': 11, '경험': 514, '되는': 2202, '요소': 5893, '합니다': 9094, '기독교인': 1165, '아니라고': 5077, '하더라도': 8946, '관용': 756, '표현': 8787, '예술': 5722, '한번': 9057, '하게': 8911, '됩니다': 2271, '따라서': 2390, '어떤': 5364, '어떻게': 5367, '해석': 9168, '왔는지': 5836, '형성': 9324, '역사': 5553, '영향': 5701, '주었는지': 7513, '살펴봄으로써': 4097, '근간': 1074, '가치관': 131, '발견': 3256, '입니다': 6696, '이를': 6365, '통해': 8516, '정신': 7259, '설립': 4334, '연세대학교': 5609, '재학': 7005, '중인': 7616, '글로벌': 1095, '사회': 4030, '인류': 6526, '보편': 3596, '가치': 130, '추구': 8123, '있을지': 6774, 

## TF-IDF 2) 직접 계산

In [None]:
from math import log

def tfidf(doc):
    """Compute TF, DF, IDF and TF-IDF tables for a collection of documents.

    Each document is split on single spaces. Empty tokens (from empty
    documents or repeated spaces) are ignored instead of being counted as a
    term. Every document is tokenized exactly once.

    Args:
        doc: Iterable of documents, each a string of space-separated tokens.

    Returns:
        A ``(tf, df, idf, tf_idf)`` tuple:
          * ``tf``: DataFrame (documents x terms) of raw term counts.
          * ``df``: list giving, for each term column, the number of
            documents that contain the term.
          * ``idf``: DataFrame (documents x terms); every row holds
            ``log(N / (df + 1))`` where N is the number of documents.
            The value is negative for a term present in every document.
          * ``tf_idf``: DataFrame (documents x terms), ``tf * idf``.
        Term columns appear in order of first occurrence; rows follow the
        order of ``doc``.
    """
    from collections import Counter

    # Per-document term counts, skipping empty tokens.
    term_counts = [
        Counter(token for token in str(text).split(' ') if token)
        for text in doc
    ]
    n_docs = len(term_counts)

    # Vocabulary in deterministic first-seen order.
    vocab = list(dict.fromkeys(term for counts in term_counts for term in counts))
    column_of = {term: j for j, term in enumerate(vocab)}

    # DF: number of documents containing each term.
    doc_freq = Counter()
    for counts in term_counts:
        doc_freq.update(counts.keys())
    df = [doc_freq[term] for term in vocab]

    # TF (document-term matrix): only non-zero cells are written.
    tf_matrix = np.zeros((n_docs, len(vocab)), dtype=np.int64)
    for row, counts in enumerate(term_counts):
        for term, count in counts.items():
            tf_matrix[row, column_of[term]] = count

    # IDF is constant per term; it is repeated on every row to keep the
    # documents x terms shape of the returned table.
    idf_row = np.array([log(n_docs / (n + 1)) for n in df], dtype=float)

    tf = pd.DataFrame(tf_matrix, columns=vocab)
    idf = pd.DataFrame(np.tile(idf_row, (n_docs, 1)), columns=vocab)
    tf_idf = pd.DataFrame(tf_matrix * idf_row, columns=vocab)

    return tf, df, idf, tf_idf

In [None]:
# One space-joined token string per course
doc = tuple(df_word.loc[:, "tokenized_word"])
# The document-frequency list gets its own name: unpacking it into `df`
# would overwrite the syllabus DataFrame that later cells still index.
tf, doc_freq, idf, tf_idf = tfidf(doc)



In [None]:
# (Re)build the merged syllabus DataFrame `df` from the data files.
# A single concat call avoids re-copying the growing frame on every loop
# iteration and avoids concatenating onto an empty DataFrame.
df = pd.concat([pd.read_csv(os.path.join(path, f)) for f in file_list])

# Drop the saved index column and the original overview column, then keep
# only the columns used downstream (in this order).
df = df.drop(columns=["Unnamed: 0", "강의개요"])
df = df.reindex(columns=["대분류", "학정번호", "강의명", "교수명", "강의개요한국어", "유의사항"])
# The Korean overview column becomes the canonical "강의개요" column.
df = df.rename(columns={"강의개요한국어": "강의개요"})
# Index rows by course code.
df = df.set_index("학정번호")

In [None]:
# Label the TF-IDF rows with the course codes from df.
# The assignment is positional, so it assumes tf_idf's rows are in the same order as df's rows.
tf_idf.index = df.index
tf_idf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,구약성경,사그라든,컨,나누어,검열,구직,벗어나지,격려,레파,...,라이센스,감도,재미있어야,장단점,정수,매입,서양음악사,길,시가,정시
학정번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YCA1007-01-00,2.870138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YCA1009-01-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YCA1101-12-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YCA1102-14-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YCA1103-08-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATM3106-01-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ATM3107-01-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ATM4102-01-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ATM4103-01-00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Course metadata columns, placed in front of the per-term TF-IDF columns
df_ = df[["대분류", "강의명", "교수명"]]
df_tfidf = pd.concat([df_, tf_idf], axis=1)
df_tfidf.head()

In [None]:
keyword = input("원하는 수업 내용? >>>")   # keyword to search for (input() already returns str)

try:
    # Courses whose TF-IDF score for the keyword is positive, highest score first
    return_df = df_tfidf[df_tfidf[keyword] > 0]
    return_df = return_df.sort_values(keyword, ascending=False)
    return_df = return_df.loc[:, ["대분류", "강의명", "교수명", keyword]]
except (KeyError, TypeError, ValueError):
    # KeyError: the keyword is not a term column.
    # TypeError/ValueError: e.g. the keyword names a non-numeric metadata column.
    # Other exceptions (including KeyboardInterrupt) are no longer swallowed.
    print("다른 단어를 입력해보세요.")
else:
    print("해당 내용을 포함한 수업: ")
    print(return_df)

원하는 수업 내용? >>>미분
해당 내용을 포함한 수업: 
                          대분류                강의명      교수명         미분
학정번호                                                                
MAT1001-10-01        언더우드국제대학      미분적분학과벡터해석(1)  베일리존앤드류  14.748395
MAT2016-06-01            공과대학            공학수학(3)      박동훈  14.748395
BIZ2114-01-00            경영대학            경영수학(2)      서승범  14.748395
ECO1101-04-00  국제캠퍼스(2019학번~)            경제수학(1)      서보윤   9.832264
ECO1101-03-00            상경대학            경제수학(1)      이진현   9.832264
MAT3114-02-01            공과대학               수치해석      윤명호   4.916132
ECO1101-02-00            상경대학            경제수학(1)      문명인   4.916132
MAT2102-01-00            이과대학            선형대수(1)      고형준   4.916132
MAT3104-01-00            이과대학             해석학(1)      김세익   4.916132
STA1002-03-00            상경대학              미분적분학      이명숙   4.916132
ECO1101-04-00            상경대학            경제수학(1)      서보윤   4.916132
BIZ2113-01-00            경영대학            경영수학(1)      서승범   4.916132
B