# Document Similarity

## Analyse data case 05 - T-test

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
from matplotlib import rcParams
from matplotlib import style

# Start from the bundled 'ggplot' theme; the overrides below are applied on top of it.
style.use('ggplot')

# Default size of every figure, in inches (width, height).
plt.rc('figure', figsize=(40, 20))

# Base font size, and draw negative signs with an ASCII hyphen
# instead of the Unicode minus glyph.
rcParams.update({
    'font.size': 30,
    'axes.unicode_minus': False,
})


### Define value

In [None]:
# Path of the pickled score table read by `pd.read_pickle` further down.
# It is a relative path, so it is resolved against the current working directory.
fname_result = 'data/result_score.pkl'

### Load contents file (pickle)

In [None]:
# Load the pickled table stored at `fname_result`; the rest of the notebook
# uses it as a DataFrame (`.columns`, `.loc`).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_c = pd.read_pickle(fname_result)

In [None]:
# Display the column labels of the loaded table.
df_c.columns

In [None]:
# Rewrite the one-letter class codes in place. The numeric prefixes make
# '1_BEFORE' sort ahead of '2_AFTER' wherever the labels are ordered as strings.
for old_label, new_label in (('B', '1_BEFORE'), ('A', '2_AFTER')):
    df_c.loc[df_c['class_type'] == old_label, 'class_type'] = new_label

### Normalization

In [None]:
# Split the column labels by position: the first five are carried over unchanged
# (presumably document metadata - TODO confirm), all remaining ones are the
# columns that get rescaled.
all_columns = list(df_c.columns)
col_list1, col_list2 = all_columns[:5], all_columns[5:]

In [None]:
# Global maximum over all columns in `col_list2`:
# per-column maxima first, then the largest of those.
per_column_max = df_c[col_list2].max()
MAX = per_column_max.max()
MAX

In [None]:
# Global minimum over all columns in `col_list2`:
# per-column minima first, then the smallest of those.
per_column_min = df_c[col_list2].min()
MIN = per_column_min.min()
MIN

In [None]:
def func(x):
    """Min-max rescale *x* onto a 0-100 scale.

    Uses the module-level ``MIN`` and ``MAX``: ``MIN`` maps to 0 and ``MAX``
    maps to 100. No guard is applied when ``MAX == MIN``.
    """
    value_range = MAX - MIN
    return (x - MIN) * 100 / value_range

In [None]:
# Rescale every column in `col_list2` with `func` and re-attach the untouched
# `col_list1` columns in front.
# `func` is plain arithmetic, so it is applied to the whole sub-frame at once
# rather than cell-by-cell through `DataFrame.applymap`, which is deprecated in
# recent pandas releases.
df_n = pd.concat([df_c[col_list1], func(df_c[col_list2])], axis=1)

## 1. 기업(comp)별 집계

### 기업별 분류 컬럼 추가

In [None]:
# Company = the part of the file name that precedes the first underscore.
fname_tokens = df_n['fname'].str.split('_')
df_n['company'] = fname_tokens.map(lambda tokens: tokens[0])

### 년도 분류 컬럼 추가

In [None]:
# Year = the second token of the file name once it is split on any of
# '_', '(', '-' or '.'.
fname_pieces = df_n['fname'].str.split(r'_|\(|\-|\.')
df_n['year'] = fname_pieces.map(lambda pieces: pieces[1])

In [None]:
# Display the normalized table, now including the derived 'company' and 'year' columns.
df_n

---

## 1. Goal별 성장률

### Goal별 점수(유사도 평균) 계산 - Before/After

In [None]:
# Mean of every numeric column per class type. 'doc_id' is removed from the
# result (presumably an identifier rather than a score - TODO confirm).
# Transposed so that each remaining column becomes a row and each class type
# becomes a column.
gp_c = (
    df_n.groupby(['class_type'])
        .mean(numeric_only=True)
        .drop(columns='doc_id')
        .T
)

In [None]:
# Display the per-class mean scores (one row per score column).
gp_c

### Goal별 성장률 - Before --> After

In [None]:
# Relative change from the BEFORE mean to the AFTER mean, in percent,
# rounded to two decimals.
before_mean = gp_c['1_BEFORE']
after_mean = gp_c['2_AFTER']
gp_c['ratio'] = ((after_mean - before_mean) * 100 / before_mean).round(2)

In [None]:
# Display the per-class means together with the new 'ratio' column.
gp_c

---

## 2. Goal별 평균 T-test - Before/After

In [None]:
import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [None]:
# Names of the 17 goal columns: 'goal01' through 'goal17'.
glist = [f'goal{number:02d}' for number in range(1, 18)]

### t 검정 함수 정의

In [None]:
def run_ttest(group1, group2):
    """Compare the means of two independent samples with a two-sample t-test.

    The test does NOT assume equal variances (``equal_var=False``, i.e.
    Welch's t-test).

    Returns the result object of ``scipy.stats.ttest_ind``; index 0 is the
    t-statistic and index 1 is the p-value.
    """
    welch_result = stats.ttest_ind(group1, group2, equal_var=False)
    return welch_result
    

### Goal별 평균값 t 검정

In [None]:
# Run the t-test once per goal column, comparing the BEFORE rows with the AFTER rows.
# The class masks do not depend on the goal, so they are computed a single time.
is_before = df_n['class_type'] == '1_BEFORE'
is_after = df_n['class_type'] == '2_AFTER'

# Collect one (t-statistic, p-value) pair per goal and build the result table in
# one step, instead of growing an empty DataFrame one cell at a time.
ttest_rows = []
for gname in glist:
    g1 = df_n.loc[is_before, gname].values
    g2 = df_n.loc[is_after, gname].values

    tResult = run_ttest(g1, g2)
    ttest_rows.append((tResult[0], tResult[1]))

# One row per goal (indexed by goal name), two columns.
df = pd.DataFrame(ttest_rows, index=glist, columns=['t-statistic', 'p-value'])
    

In [None]:
# Display the t-statistic and p-value computed for each goal.
df

### Goal별 성장률, 평균값 t 검정

In [None]:
# Place the per-class means / growth ratio and the t-test results side by side,
# aligned on their row labels.
df_t = pd.concat(objs=[gp_c, df], axis='columns')

In [None]:
# Display the combined table: class means, growth ratio, t-statistic and p-value.
df_t

---

In [None]:
# End of file