# 전처리

In [1]:
import re
import pickle
import pandas as pd

## 소속 코드 전처리

In [2]:
with open('category_codes.pickle','rb') as f:
    category_codes = pickle.load(f)

codes = category_codes.values()

In [3]:
code_book=pd.DataFrame()
for cd in list(codes):
    temp = pd.DataFrame(cd).T
    code_book = pd.concat([code_book,temp])
code_book=code_book.reset_index()
code_book.columns=['code','name','upper']

In [4]:
def find_major(x):
    if x not in code_book['upper'].unique():
        return '전공'
    else :
        return '미정'

In [5]:
code_book['assign'] = code_book['code'].apply(find_major)
code_book.loc[code_book['upper']=='없음','assign']='단과대'

## 강의정보 데이터 전처리

In [81]:
# DF 전처리 (과목별 GROUP)

df = pd.read_pickle('lecture_raw.csv')
df_sum = df.groupby(['과목명','교수명'])['과목분류'].unique().reset_index()

cols = list(df.columns[3:])

for c in cols:
    df_sum2 = df.groupby(['과목명','교수명'])[c].unique().reset_index()
    df_sum = pd.merge(df_sum,df_sum2) 

In [82]:
# 학부,학과 단위로 Category 설정
college = list(code_book.query('assign=="단과대"')['code'])
df_sum['소속코드'] = df_sum['소속코드'].apply(lambda x:list(set(x)-set(college)))

In [83]:
# 최근값으로 대치, 결측값 '정보없음'으로 통일

df_sum['과목코드'] = df_sum['과목코드'].apply(lambda x:x[-1])
df_sum['평점'] = df_sum['평점'].apply(lambda x:x[-1])
df_sum['시간'] = df_sum['시간'].apply(lambda x:x[-1])
df_sum['학점'] = df_sum['학점'].apply(lambda x:x[-1])
df_sum['비고'] = df_sum['비고'].apply(lambda x:x[-1])
df_sum['링크'] = df_sum['링크'].apply(lambda x:x[-1])

df_sum['비고'].fillna('정보없음',inplace=True)
df_sum['시간'].fillna('정보없음',inplace=True)

In [84]:
# 소속코드 -> 소속이름 으로 대치
def get_college(code):
    string = code_book.query('code==@code')['name'].values
    return string

In [85]:
df_sum['소속코드'] = df_sum['소속코드'].apply(lambda x: ((get_college(x))[-1]))

In [86]:
df_sum.to_pickle('lecture_processed.pickle')

## 이모지 코드 생성

In [87]:
def get_emoji(name):
    e = ""
    if re.search('문학',name) != None:
        e = '<i class="fas fa-pen-nib"></i>'
    elif re.search('건축',name) != None:
        e = '<i class="fas fa-drafting-compass"></i>'
    elif re.search('경제',name) != None :
        e = '<i class="fas fa-coins"></i>'
    elif re.search('디자인|미술|공예',name) != None :
        e = '<i class="fas fa-palette"></i>'
    elif re.search('경영',name) != None :
        e = '<i class="fas fa-user-tie"></i>'
    elif re.search('법|사회|교육',name) != None :
        e = '<i class="fas fa-balance-scale"></i>'
    elif re.search('공학|나노|전자|물리|화학|수학',name) != None:
        e = '<i class="fas fa-wrench"></i>'
    elif re.search('행정',name) != None:
        e = '<i class="fas fa-book-open"></i>'
    elif re.search('체육|스포츠',name) != None:
        e = '<i class="fas fa-swimmer"></i>'
    elif re.search('소프트',name) != None:
        e = '<i class="fas fa-code"></i>'
    elif re.search('KMU|국제|정치|유라시아|일본|중국',name) != None:
        e = '<i class="fas fa-globe-americas"></i>'
    elif re.search('사학',name) != None:
        e = '<i class="fas fa-scroll-old"></i>'
    elif re.search('언론|정치',name) != None:
        e = '<i class="fas fa-newspaper"></i>'
    elif re.search('공연',name) != None:
        e = '<i class="fas fa-theater-masks"></i>'
    elif re.search('자동차',name) != None:
        e = '<i class="fas fa-car-side"></i>'
    elif re.search('회계',name) != None:
        e = '<i class="fas fa-file-invoice-dollar"></i>'
    elif re.search('음악',name) != None :
        e = '<i class="fas fa-music"></i>'
    elif re.search('발효|바이오|산림|식품',name) != None:
        e = '<i class="fas fa-biohazard"></i>'
    else :
        e = '<i class="fas fa-university"></i>'
    
    
    return e
        
        

In [13]:
emoji = pd.DataFrame(df_sum['소속코드'].unique(),columns=['name'])
emoji['url'] = emoji['name'].apply(get_emoji)

emoji_export={}
for n,u in zip(emoji['name'],emoji['url']):
    emoji_export[n]=u

In [14]:
import json
with open('emoji.json', 'w', encoding = 'utf-8') as f:
    json.dump(emoji_export, f , ensure_ascii=False)

# 데이터 가져오기

In [88]:
import pandas as pd
import json
import pickle

In [89]:
lec_sim = pd.read_pickle('lec_sim.pkl')
prof_sim = pd.read_pickle('prof_sim.pkl')
df_sum = pd.read_pickle('lecture_processed.pickle')
with open('details.pickle','rb') as f:
    details = pickle.load(f)

In [90]:
review_df = pd.read_pickle('review_0213.csv')
review_df.dropna(axis=0,inplace=True)

In [91]:
lst=[]
for n,f in zip(review_df['name'],review_df['prof']):
    lst.append('-'.join([n,f]))

review_df['key'] = lst

review_df['senti'] = review_df['score'].apply(lambda x: int(x)>3)

lec_posi = review_df.groupby('key')['senti'].mean().reset_index()
prof_posi = review_df.groupby('prof')['senti'].mean().reset_index()

review_df['score'] = review_df['score'].apply(lambda x:int(x))

prof_score = review_df.groupby('prof')['score'].mean().reset_index()

In [92]:
prof_posi['senti2'] = 1 - prof_posi['senti']

In [93]:
prof_add = pd.merge(prof_posi,prof_score)

## 데이터 합치기

In [94]:
# 유사도 모듈 결과값이랑 형식 통일
keys=[]
for i,k in zip(df_sum['과목명'],df_sum['교수명']) :
    keys.append('-'.join([str(i),str(k)]))

### search_lecture 생성

In [95]:
df_sum['key'] = keys
df_export = pd.merge(df_sum,lec_sim,how='left')
df_export.loc[:,['sim','tags']] = df_export.loc[:,['sim','tags']].fillna('정보없음')

In [96]:
df_export.loc[df_export.query('교수명 ==""').index,'교수명']='None'

In [97]:
# detail 넣기 편하게 전처리 후 lecture 정보와 합치기
detail_to_export=[]
for i in details :
    n = i['name']
    p = i['prof']
    lst=[]
    for k in i['detail'].keys():
        lst.append(i['detail'][k])
    temp = {
        'name':n,
        'prof':p,
        '출석':lst[0],
        '성적비율':lst[1],
        '과제':lst[2],
        '팀플':lst[3],
        '시험횟수':lst[4]        
    }
    detail_to_export.append(temp)

df_details = pd.DataFrame.from_dict(detail_to_export)
df_details.replace('','정보없음',inplace=True)

temp=[]
for i,k in zip(df_details['name'],df_details['prof']):
    temp.append('-'.join([i,k]))
    
df_details['key']=temp
df_export = pd.merge(df_export,df_details, how = 'left')

In [98]:
# '[]' 형식 만들기 위한 함수
def process(x):
    if type(x) == str:
        return [x]
    else :
        return x

In [99]:
df_export['tags'] = df_export['tags'].apply(lambda x:process(x))

In [113]:
df_export = pd.merge(df_export,lec_posi,how='left')

df_export['senti2'] = 1-df_export['senti']

In [158]:
df_export.fillna(0,inplace=True)

In [194]:
df_export

Unnamed: 0,과목명,교수명,과목분류,과목코드,평점,소속코드,시간,학점,비고,링크,...,tags,name,prof,출석,성적비율,과제,팀플,시험횟수,senti,senti2
0,20세기패션,김수현,[전공선택],071211-01,0,의상디자인학과,"월 3(11:00-11:50) [조형관5층17호실],월 4(12:00-12:50) ...",2,타과생수강불가(다부전공자제외),https://everytime.kr/lecture/view/859697,...,[정보없음],20세기패션,김수현,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000
1,20세기패션,김의경,[전공선택],071211-01,0,의상디자인학과,"화 5(13:00-13:50) [조형관5층15-1호실],화 6(14:00-14:50...",2,타과생수강불가(다전공자제외),https://everytime.kr/lecture/view/547761,...,[정보없음],20세기패션,김의경,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000
2,20세기패션,이은정,[전공선택],071211-01,0,의상디자인학과,수 5(13:00-13:50) [조형관5층17호실]<br>수 6(14:00-14:5...,2,타과생수강불가(다부전공자제외),https://everytime.kr/lecture/view/1418316,...,[정보없음],20세기패션,이은정,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000
3,21세기창업과벤처(공학인증),조봉환,[교양선택],040396-01,0,정치·경제·사회·세계,"수 E(15:00-16:15) [경영관1층9호실],수 F(16:30-17:45) [...",3,,https://everytime.kr/lecture/view/546056,...,[정보없음],21세기창업과벤처(공학인증),조봉환,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000
4,21세기트렌드와정책이슈,김진동,[교양선택],017163-91,0,정치·경제·사회·세계,"토 0(08:00-08:50) [북악관6층9호실],토 1(09:00-09:50) [...",2,"행정관리학과(계약학과), 일반학생 수강불가",https://everytime.kr/lecture/view/542422,...,[정보없음],21세기트렌드와정책이슈,김진동,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13478,휴먼테크놀로지캡스톤디자인II,이건상,[전공선택],114840-01,0,기계공학부 융합기계공학전공,금 6(14:00-14:50) [글로벌센터지하1층5호실]<br>금 7(15:00-1...,3,,https://everytime.kr/lecture/view/1417987,...,[정보없음],휴먼테크놀로지캡스톤디자인II,이건상,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000
13479,휴먼테크놀로지캡스톤디자인II,임시형,[전공선택],114840-01,0,기계시스템공학부 융합기계공학전공(2017년폐지),화 6(14:00-14:50) [산학협력관지하1층3호실]<br>화 7(15:00-1...,3,,https://everytime.kr/lecture/view/2023123,...,[정보없음],휴먼테크놀로지캡스톤디자인II,임시형,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000
13480,희곡론,서재길,[전공선택],000431-01,3.18,한국어문학부 국어국문학전공,화 B(10:30-11:45) [북악관11층11호실]<br>목 B(10:30-11:...,3,,https://everytime.kr/lecture/view/541677,...,"[연극, 쪽지, 발표, 퀴즈, 기준]",희곡론,서재길,직접호명,비율 채워줌,많음,보통,네 번 이상,0.454545,0.545455
13481,희곡론,이화진,[전공선택],000431-01,0,국어국문학과,화 C(12:00-13:15) [북악관3층8호실]<br>화 D(13:30-14:45...,3,,https://everytime.kr/lecture/view/1081747,...,[정보없음],희곡론,이화진,정보없음,정보없음,정보없음,정보없음,정보없음,0.000000,0.000000


In [159]:
#search_lecture 모델생성
search_lecture=[]
for vals in df_export.values:
    temp = {
        "model" : "search.lecture",
        "fields" : {
            'name' : vals[0],
            'prof' : [vals[1]],
            'prof_name' : vals[1],
            'class_type' : str(vals[2]),
            'class_code' : vals[3],
            'score' : vals[4],
            'category' : vals[5],
            'icon' : [vals[5]],
            'time' : vals[6],
            'recommend_year' : vals[7],
            'remarks' : vals[8],
            'link' : vals[9],
            'semester' : str(vals[10]),
            'hash_tags' : str(vals[13]),
            
            'attendance' : str(vals[16]),
            'score_stlict' : str(vals[17]),
            'homework' : str(vals[18]),
            'teamplay' : str(vals[19]),
            'test_time': str(vals[20]),
            'positive' : vals[21],
            'negative' : vals[22]
        }
    }
    search_lecture.append(temp)

### search_prof 생성

In [174]:
# 교수의 대표값(평균적인 강의 방식) 찾아주기
cols = ['성적비율','과제','팀플','시험횟수']
prof_details = df_details.query('출석 !="정보없음"')

# 최빈값 찾기 위해 list화
prof_details.iloc[:,2:7] = prof_details.iloc[:,2:7].applymap(lambda x:[x])
prof_mode = prof_details.groupby('prof')['출석'].sum().apply(lambda x:max(set(x), key=x.count)).reset_index()
# 최빈값으로 summary해서 저장
for i in cols:
    prof_mode = pd.merge(prof_mode,prof_details.groupby('prof')[i].sum().apply(lambda x:max(set(x), key=x.count)).reset_index(),how='right')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [168]:
for_assign = df_export.copy()
for_assign['소속코드'] = for_assign['소속코드'].apply(process)

In [175]:
# 교수 소속 정보 얻기
prof_assign = for_assign.groupby('교수명')['소속코드'].sum().apply(lambda x:max(set(x), key=x.count)).reset_index() 
# 교수 정보들 합치기
prof_summary = pd.merge(prof_assign,prof_mode,how='left',left_on = '교수명', right_on ='prof')
# 결측치 대치
prof_summary.fillna('정보없음',inplace=True)

In [176]:
# 유사도와 합치기(tag 얻기 위함)
prof = pd.merge(prof_summary,prof_sim[['key','tags']],how='left',left_on='prof',right_on='key')
prof.drop('key',axis=1,inplace=True)
# 결측치 대치
prof['tags'] = prof['tags'].fillna('정보없음').apply(lambda x:process(x))

In [177]:
prof = pd.merge(prof,prof_add,how='left')

In [205]:
prof.columns

Index(['교수명', '소속코드', 'prof', '출석', '성적비율', '과제', '팀플', '시험횟수', 'tags',
       'senti', 'senti2', 'score'],
      dtype='object')

In [210]:
#search_prof 모델 생성
search_prof=[]
for i in prof.values:
    temp = {
        "model": "search.prof",
        "fields": {
            'name': i[0],
            'category' : i[1],
            'icon' : [i[1]],
            'hash_tags' : i[8],
            'attendance': i[3],
            'score_stlict': i[4],
            'homework' : i[5],
            'teamplay' : i[6],
            'test_time': i[7],
            'positive' : i[9],
            'negative' : i[10],
            'score' : i[11]
        }
        }
    search_prof.append(temp)

In [211]:
search_prof

[{'model': 'search.prof',
  'fields': {'name': 'Andrew P. Ippoliti',
   'category': '도자공예학과',
   'icon': ['도자공예학과'],
   'hash_tags': ['정보없음'],
   'attendance': '정보없음',
   'score_stlict': '정보없음',
   'homework': '정보없음',
   'teamplay': '정보없음',
   'test_time': '정보없음',
   'positive': 0.0,
   'negative': 0.0,
   'score': 0.0}},
 {'model': 'search.prof',
  'fields': {'name': 'Luo Tao',
   'category': '시각디자인학과',
   'icon': ['시각디자인학과'],
   'hash_tags': ['선택', '폭탄', '유의', '디자인', '진로'],
   'attendance': '반영안함',
   'score_stlict': '비율 채워줌',
   'homework': '많음',
   'teamplay': '많음',
   'test_time': '없음',
   'positive': 1.0,
   'negative': 0.0,
   'score': 5.0}},
 {'model': 'search.prof',
  'fields': {'name': 'None',
   'category': '학과미배정',
   'icon': ['학과미배정'],
   'hash_tags': ['정보없음'],
   'attendance': '정보없음',
   'score_stlict': '정보없음',
   'homework': '정보없음',
   'teamplay': '정보없음',
   'test_time': '정보없음',
   'positive': 0.0,
   'negative': 0.0,
   'score': 0.0}},
 {'model': 'search.prof',
  'field

## search_similarlecture 모델 생성

In [188]:
# search_similarlecture 모델 생성
pk = 1
search_similarlecture=[]
for ks,ss in zip(lec_sim['key'],lec_sim['sim']):
    k = ks.split('-')
    
    if len(k) <= 2 :
        temp_name = k[0]
        temp_prof = k[1]
        
    elif len(k) > 2 :
        temp_name = ('-'.join(k[:-1]))
        temp_prof = (k[-1])
        
    for sr in ss:
        s = sr.split('-')
        if len(s) <= 2 :
            temp_name_sim = s[0]
            temp_prof_sim = s[1]
        
        elif len(s) > 2 :
            temp_name_sim = ('-'.join(s[:-1]))
            temp_prof_sim = (s[-1])
        
        temp={
            "model": "search.similarlecture",
            "pk": pk,
            "fields": {
            "similar_from": [
                temp_name,
                temp_prof
            ],
            "similar_to": [
                temp_name_sim,
                temp_prof_sim
            ]
            }
            }
        search_similarlecture.append(temp)
        pk += 1

## search_similarprof 모델 생성


In [189]:
# search_similarprof 모델 생성
search_similarprof=[]
pk = 1
for k,ss in zip(prof_sim['key'],prof_sim['sim']):
    for s in ss:
        temp={
          "model": "search.similarprof",
          "pk": pk,
          "fields": {
            "similar_from": [
              k
            ],
            "similar_to": [
              s
            ]
          }
        }
        search_similarprof.append(temp)
        pk += 1

In [190]:
with open('search_lecture.json', 'w', encoding = 'utf-8') as f:
    json.dump(search_lecture, f , ensure_ascii=False)

In [212]:
with open('search_prof.json', 'w', encoding = 'utf-8') as f:
    json.dump(search_prof, f , ensure_ascii=False)

In [192]:
with open('search_similarlecture.json', 'w', encoding = 'utf-8') as f:
    json.dump(search_similarlecture, f , ensure_ascii=False)

In [193]:
with open('search_similarprof.json', 'w', encoding = 'utf-8') as f:
    json.dump(search_similarprof, f , ensure_ascii=False)