In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [4]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [5]:
df['age'].fillna(df['age'].mean(), inplace=True)
df['embarked'].fillna(df['embarked'].mode().values[0], inplace=True)
df['embark_town'].fillna(df['embark_town'].mode().values[0], inplace=True)
df.drop(columns=['deck'], inplace=True)

In [6]:
df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [10]:
df_cat = df.select_dtypes(exclude=np.number)

In [11]:
encoder = LabelEncoder()

for col in df_cat.columns:
    df_cat[col] = encoder.fit_transform(df_cat[col])
df[df_cat.columns] = df_cat

In [12]:
train, test = train_test_split(df, random_state=42)
train.shape

(668, 14)

In [23]:
train_y = train.survived
train_x = train.copy()
train_x.drop(columns=['survived'], inplace=True)

valid = test.survived
test.drop(columns=['survived'], inplace=True)

In [34]:
train_x.columns

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class',
       'who', 'adult_male', 'embark_town', 'alive', 'alone'],
      dtype='object')

In [39]:
train_x2 = train_x[['pclass', 'sex']]
test2 = test[['pclass', 'sex']]

In [40]:
model = RandomForestClassifier(random_state=42)

model.fit(train_x2, train_y)

pred = model.predict(test2)

print("정확도 : ", f1_score(pred, valid))
print("정확도 : ", roc_auc_score(pred, valid))

정확도 :  0.6474820143884892
정확도 :  0.8228323699421966


In [25]:
model = LogisticRegression()

model.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
train_x.columns

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class',
       'who', 'adult_male', 'embark_town', 'alive', 'alone'],
      dtype='object')

In [27]:
parch_coefficient = model.coef_[0][4]
parch_coefficient

-0.21377576128171016

In [28]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,1,2,0,0
1,1,1,0,38.0,1,0,71.2833,0,0,2,0,0,1,0
2,1,3,0,26.0,0,0,7.925,2,2,2,0,2,1,1


In [33]:
from sklearn.preprocessing import MinMaxScaler

encoder = MinMaxScaler()

data_age = pd.DataFrame(df.age)

data_sc = encoder.fit_transform(data_age)


print(len(data_sc[data_sc>=0.5]))

150


In [41]:
df['fare']

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [46]:
df.corr()['survived'].sort_values()

adult_male    -0.557080
sex           -0.543351
class         -0.338481
pclass        -0.338481
alone         -0.203367
embarked      -0.167675
embark_town   -0.167675
age           -0.069809
sibsp         -0.035322
parch          0.081629
fare           0.257307
who            0.325753
survived       1.000000
alive          1.000000
Name: survived, dtype: float64

In [48]:
df.describe()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,1.536476,1.308642,1.210999,0.602694,1.536476,0.383838,0.602694
std,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,0.791503,0.836071,0.594291,0.489615,0.791503,0.486592,0.489615
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,1.0,1.0,1.0,0.0,1.0,0.0,0.0
50%,0.0,3.0,1.0,29.699118,0.0,0.0,14.4542,2.0,2.0,1.0,1.0,2.0,0.0,1.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0,2.0,2.0,1.0,2.0,1.0,1.0


In [51]:
f1 = df['fare'].quantile(1/4)
f3 = df['fare'].quantile(3/4)
iqr= f3 - f1

min = f1 - 1.5*iqr
max = f3 + 1.5*iqr

print(len(df[(df['fare']<min) | (df['fare']>max)]))

116


In [52]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e7_p1.csv')
df.head(5)

Unnamed: 0,학생,국어,수학,영어,과학
0,ID_0,63.0,,79.0,84.0
1,ID_1,91.0,93.0,,73.0
2,ID_2,59.0,55.0,,56.0
3,ID_3,71.0,83.0,82.0,
4,ID_4,62.0,72.0,56.0,


In [53]:
df.isnull().sum()

학생    0
국어    2
수학    3
영어    4
과학    6
dtype: int64

In [54]:
from scipy.stats import zscore
sub = '국어'
result = zscore(df[sub].dropna()).max()

In [55]:
result

1.713855688712825

In [56]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e7_p2_.csv')
df.head(5)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32
0,-0.335864,-2.425701,0.452851,-1.381924,-0.425591,-1.461498,-0.417082,-0.944226,-0.351224,1.474046,...,1.715714,0.300227,0.439828,-0.094772,1.551995,3.308446,-0.391377,1.963829,-1.458605,-0.348652
1,0.803535,-0.699433,0.095579,0.788053,-0.267674,-0.370746,0.636034,-1.674584,-0.058465,-1.188973,...,-1.348879,-0.89948,-1.271886,0.333734,-0.373565,-2.091508,-0.715245,-0.688441,0.910927,-0.746899
2,-0.40847,-0.361105,-0.857278,0.338294,2.317821,-0.099928,0.557053,-0.847932,0.973471,0.101203,...,-0.388353,0.918553,-1.984422,0.385055,1.161414,-1.280253,2.831802,-0.953771,-0.228466,0.766254
3,-0.690599,0.777928,-0.079963,0.164086,1.721702,0.146761,-0.988636,0.885344,0.647053,0.674609,...,0.337074,-1.406224,-0.699728,-0.836068,-0.226315,1.260893,-0.78163,-0.219995,0.871289,0.017578
4,0.179819,-0.668267,0.381849,-0.884196,0.758519,-1.02619,0.986227,-1.663167,2.290551,0.564845,...,-0.700137,-0.153825,-1.835313,0.182131,0.890247,0.024791,1.489955,1.276357,-0.285144,1.066486


In [63]:
sub = df.corr()['var_11'].abs().sort_values().index[-2]
ans = df[sub].mean()
print(ans)

-0.06289356546077182


In [64]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e7_p3.csv')
df.head(5)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12
0,-0.401584,0.007198,-0.13491,-0.419514,-0.686331,-1.17455,-0.49915,-0.55751,0.357347,0.844209,0.082097,0.475098
1,1.064622,-0.531247,0.571185,-0.762243,-1.456043,0.355432,0.88001,0.523183,-0.141733,0.301472,0.840299,-0.045113
2,0.900195,-1.148496,-0.718437,-2.315266,-0.755505,0.630923,0.586027,0.086497,-0.381657,-1.602532,1.56634,-0.654105
3,1.199227,-0.1142,0.101536,-1.851971,-0.672844,0.796077,0.644246,-0.836296,-1.707074,0.376281,1.303967,1.228015
4,0.607393,1.675917,1.063759,-0.799493,-0.959568,-0.103007,-0.245984,-0.059492,0.293592,-0.517044,0.100772,0.403833


In [66]:
q1 = df['var_6'].quantile(1/4)
q3 = df['var_6'].quantile(3/4)
iqr = q3 - q1
min = q1 - 1.5*iqr
max = q3 + 1.5*iqr

print(len(df[(df['var_6']>min) | (df['var_6']<max)]['var_6']))

523


In [73]:
len(df[(df['var_6']<min) | (df['var_6']>max)]['var_6'])

8

In [74]:
import pandas as pd
train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e7_p2_train2.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e7_p2_test2.csv')

display(train.head(2))
test.head(2)

Unnamed: 0,ID,연월,업종명,이용자구분,성별,이용자수,이용건수,이용금액
0,ID_5020,202201,여관업,법인,알수없음,7693,12105,3049021809
1,ID_5021,202201,여관업,제주도민,남성,3990,4291,294163241


Unnamed: 0,ID,연월,업종명,이용자구분,성별,이용자수,이용건수
0,ID_2575,201911,한식 음식점업,제주도민,여성,153363,163462
1,ID_6637,202305,건강보조식품 소매업,법인,알수없음,136,144


In [75]:
train.columns, test.columns

(Index(['ID', '연월', '업종명', '이용자구분', '성별', '이용자수', '이용건수', '이용금액'], dtype='object'),
 Index(['ID', '연월', '업종명', '이용자구분', '성별', '이용자수', '이용건수'], dtype='object'))

In [76]:
train_cat = train.select_dtypes(exclude=np.number)

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

for col in train_cat.columns:
    train_cat[col] = encoder.fit_transform(train_cat[col])
train[train_cat.columns] = train_cat

In [78]:
test_cat = test.select_dtypes(exclude=np.number)

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

for col in test_cat.columns:
    test_cat[col] = encoder.fit_transform(test_cat[col])
test[test_cat.columns] = test_cat

In [77]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2245 entries, 0 to 2244
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ID      2245 non-null   int32
 1   연월      2245 non-null   int64
 2   업종명     2245 non-null   int32
 3   이용자구분   2245 non-null   int32
 4   성별      2245 non-null   int32
 5   이용자수    2245 non-null   int64
 6   이용건수    2245 non-null   int64
 7   이용금액    2245 non-null   int64
dtypes: int32(4), int64(4)
memory usage: 105.4 KB


In [79]:
from sklearn.model_selection import train_test_split

df = train.copy()

train, valid = train_test_split(df, random_state=42)

train_y = train['이용금액']
train_x = train.copy()
train_x.drop(columns=['이용금액'], inplace=True)

valid_y = valid['이용금액']
valid_x = valid.copy()
valid_x.drop(columns=['이용금액'], inplace=True)

In [85]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(train_x, train_y)

pred = model.predict(valid_x)

In [87]:
from sklearn.metrics import mean_squared_error

print("예측 오차 : ", np.sqrt(mean_squared_error(pred, valid_y)))

예측 오차 :  217519611.69356036


In [89]:
df_y = df['이용금액']
df_x = df.copy()
df_x.drop(columns=['이용금액'], inplace=True)

model = RandomForestRegressor()
model.fit(df_x, df_y)
pred = model.predict(test)

In [90]:
pred

array([9.02322289e+09, 1.17042282e+08, 4.41245541e+06, ...,
       5.34667109e+09, 9.39297517e+08, 8.98986411e+08])

In [91]:
submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['이용금액'] = pred
submission.head(3)

Unnamed: 0,ID,이용금액
0,845,9023223000.0
1,3151,117042300.0
2,2611,4412455.0


In [94]:
# submission.to_csv('test.csv', encoding='utf-8', index=False)

In [95]:
import pandas as pd
df=pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e7_p3_1.csv')
df.head()

Unnamed: 0,Target,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21
0,21.650072,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1.579213,0.767435,-0.469474,...,-0.46573,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,1.465649
1,-19.469855,-0.225776,0.067528,-1.424748,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,...,-0.013497,-1.057711,0.822545,-1.220844,0.208864,-1.95967,-1.328186,0.196861,0.738467,0.171368
2,-24.4756,-0.115648,-0.301104,-1.478522,-0.719844,-0.460639,1.057122,0.343618,-1.76304,0.324084,...,0.611676,1.031,0.93128,-0.839218,-0.309212,0.331263,0.975545,-0.479174,-0.185659,-1.106335
3,13.476831,-1.196207,0.812526,1.35624,-0.07201,1.003533,0.361636,-0.64512,0.361396,1.538037,...,-2.619745,0.821903,0.087047,-0.299007,0.091761,-1.987569,-0.219672,0.357113,1.477894,-0.51827
4,3.505322,-0.808494,-0.501757,0.915402,0.328751,-0.52976,0.513267,0.097078,0.968645,-0.702053,...,-1.463515,0.29612,0.261055,0.005113,-0.234587,-1.415371,-0.420645,-0.342715,-0.802277,-0.161286


In [99]:
ans = df.corr()['Target'].sort_values().values[-2]
print(ans)

0.6270251925517436


In [106]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

x = df.copy().drop(columns='Target')
y = df['Target']

model.fit(x,y)


# 'v2' 변수의 인덱스 찾기
v2_index = x.columns.get_loc('v2')

# 'v2' 변수의 회귀 계수 출력
print(f"'v2' 변수의 회귀 계수: {model.coef_[v2_index]}")

'v2' 변수의 회귀 계수: 6.440301364843063


In [107]:
ans = model.pvalues.max()
print(ans)

AttributeError: 'LinearRegression' object has no attribute 'pvalues'

In [None]:
import statsmodels.api as sm