# 필요한 라이브러리 import

# In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import logging
import os
from json import loads
import ast
import csv

# 변수들을 float으로 변환시키는 함수

# In [2]:
def convert_to_float(df):
    """Return *df* with the stat columns and the label column cast to float.

    Each listed column is replaced in place (same name) by its value cast to
    Spark's 'float' type; columns not listed are left untouched.

    Args:
        df: DataFrame exposing ``withColumn`` and containing the columns
            listed below.

    Returns:
        The DataFrame produced after all casts have been applied.
    """
    float_columns = [
        'tkl', 'tkl_suc', 'header_att', 'header_suc',
        'drib_per_g', 'drib', 'pass_att', 'pass_suc',
        'CC', 'Int', 'Dist_covered', 'Cr_att', 'Cr_suc',
        'shot', 'shot_on_t', 'gen_point',
    ]
    converted = df
    for name in float_columns:
        as_float = col(name).cast('float')
        converted = converted.withColumn(name, as_float)
    return converted

# 선형회귀를 진행하고 도출된 가중치와 절편을 return 하는 함수

# In [3]:
def linear_regression(df, seed=None):
    """Fit an elastic-net linear regression and return its weights and intercept.

    The input is split 70/30 and the model is fitted on the 70% part only;
    the remaining 30% is not used by this function.

    Args:
        df: DataFrame with a 'features' vector column and a 'gen_point'
            label column.
        seed: Optional seed forwarded to ``randomSplit`` so the split can be
            reproduced. The default ``None`` keeps the split random.

    Returns:
        list: the model coefficients as Python floats, in feature order,
        followed by the intercept rendered as a string (kept as a string for
        backward compatibility with existing callers).
    """
    # 70/30 train/test split; only the training part is needed for fitting.
    train_df, _ = df.randomSplit([0.7, 0.3], seed=seed)

    lr = LinearRegression(
        featuresCol='features',
        labelCol='gen_point',
        maxIter=1000,
        regParam=0.3,
        elasticNetParam=0.8,
    )
    lr_model = lr.fit(train_df)

    # Read the coefficients through the vector API rather than parsing the
    # vector's string form: the string is only a Python list literal for the
    # dense "[a,b,c]" rendering, whereas a sparse rendering parses to a tuple
    # (no .append) and nan/inf values are not literals at all.
    coef_inter_list = lr_model.coefficients.toArray().tolist()
    coef_inter_list.append(str(lr_model.intercept))

    return coef_inter_list

# Spark를 사용해서 가중치 절편 구하는 함수

# In [4]:
# Per-position configuration used by cal_weight.  For each entry:
#   position - value of the 'position' column selecting the rows to fit
#   out_file - name of the CSV file the weights are written to
#   features - input columns assembled into the regression feature vector
#   header   - CSV header row: one label per feature column, in the same
#              order, followed by the label for the intercept
_POSITION_SPECS = (
    {
        'position': '공격수',
        'out_file': 'forward_weights.csv',
        'features': ['pass_att','pass_suc','CC','drib_per_g','drib','Cr_att','Cr_suc','shot','shot_on_t'],
        'header': ['패스 시도', '패스 %','기회 창출/90','경기 당 드리블','드리블','Cr A', 'Cr C/A','슈팅 수', '유효 슈팅 %','절편'],
    },
    {
        'position': '미드필더',
        'out_file': 'midfielder_weights.csv',
        'features': ['tkl','tkl_suc','pass_att','pass_suc','Dist_covered','CC','drib_per_g','drib','Cr_att','Cr_suc'],
        'header': ['태클','태클 성공률','패스 시도', '패스 %','달린 거리/90분','기회 창출/90','경기 당 드리블','드리블','Cr A', 'Cr C/A','절편'],
    },
    {
        'position': '수비수',
        'out_file': 'defender_weights.csv',
        'features': ['tkl','tkl_suc','header_att','header_suc','pass_att','pass_suc','Int'],
        'header': ['태클','태클 성공률','헤더 시도','헤더 성공%','패스 시도', '패스 %','Int/90','절편'],
    },
)


def _fit_position_weights(df, position, feature_cols):
    """Return the regression weights for the rows of *df* at *position*."""
    # Keep only this position's rows, then drop the (now constant) column.
    position_df = df.filter(df.position == position)
    position_df = position_df.drop(position_df.position)

    # The CSV reader was not given a schema, so cast the stat columns to float.
    position_df = convert_to_float(position_df)

    # Pack the chosen feature columns into a single 'features' vector column.
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    position_df = assembler.transform(position_df)

    return linear_regression(position_df.select(['features', 'gen_point']))


def cal_weight(data_path, weight_path, season):
    """Fit per-position regressions with Spark and save the weights as CSV.

    Every entry found in ``<data_path>/<season>`` is read as a CSV file with
    a header row. For each position in ``_POSITION_SPECS`` a regression is
    fitted and its coefficients plus intercept are written, under a header
    row, to ``<weight_path>/<season>/<out_file>``.

    All input files write to the same three output paths, so when the season
    directory holds several entries the weights of the last one processed
    (in sorted name order) are what remain on disk.

    Args:
        data_path: Root directory of the position-labelled input data.
        weight_path: Root directory the weight files are written under.
        season: Season sub-directory name used under both roots.

    Raises:
        OSError: if the season input directory cannot be listed or an output
            file cannot be created.
    """
    # Initialise (or reuse) the Spark session.
    spark = SparkSession.builder.appName('myapp') \
                        .master('local[*]').getOrCreate()

    total_path = data_path+'/'+season
    # os.listdir returns entries in arbitrary order; sort so that the file
    # whose weights end up on disk is deterministic.
    file_list = sorted(os.listdir(total_path))

    # Create the output folder once instead of on every loop iteration.
    out_dir = weight_path+'/'+season
    os.makedirs(out_dir, exist_ok=True)

    for file in file_list:
        # Load the CSV file with Spark, using its first line as the header.
        df = spark.read.option('header',True).csv(total_path+'/'+file)

        # Fit every position first so nothing is written if a fit fails.
        weights_per_position = [
            _fit_position_weights(df, spec['position'], spec['features'])
            for spec in _POSITION_SPECS
        ]

        # Save header + weights (coefficients followed by the intercept).
        for spec, weights in zip(_POSITION_SPECS, weights_per_position):
            with open(out_dir+'/'+spec['out_file'], 'w', newline='',encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(spec['header'])
                writer.writerow(weights)

# In [5]:
# Run configuration: input data root, season to process, output weight root.
ml_data_path = './datas/FM_DATA_FOR_ML'
season = '2020-2021'
weight_path = './datas/FM_DATA_WEIGHT'

# Make sure the season's output folder exists before computing the weights.
os.makedirs(f'{weight_path}/{season}', exist_ok=True)

# Fit the per-position regressions and write their weights to disk.
cal_weight(ml_data_path, weight_path, season)