In [None]:
# colab_notebooks/model_training.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from joblib import dump
import statsmodels.api as sm

# Load the sentiment-analysis results (indexed by date) and the stock price table.
data_path = '/content/drive/MyDrive/stock_analysis_project/data/finviz_sentiment_analysis_results.csv'
stock_data_path = '/content/drive/MyDrive/stock_analysis_project/data/stock_prices.csv'
sentiment_data = pd.read_csv(data_path, index_col='Date', parse_dates=True)
stock_data = pd.read_csv(stock_data_path, parse_dates=['Date'])

# Merge the two tables on date.
# stock_data keeps 'Date' as an ordinary column (its index is the default
# RangeIndex), so an index-to-index join against it would compare dates with
# integer row numbers. Join against a Date-indexed view instead; stock_data
# itself is left unchanged.
# NOTE(review): the join key is the date only. If both CSVs carry a 'Company'
# column, pandas will suffix them (_x/_y) and the groupby below will not find
# 'Company' -- confirm the schemas of the two files.
merged_data = pd.merge(
    sentiment_data,
    stock_data.set_index('Date'),
    left_index=True,
    right_index=True,
    how='inner',
)

# Convert the sentiment label to a number; labels outside this mapping become NaN.
merged_data['Sentiment_Score'] = merged_data['Predicted Sentiment'].map({'positive': 1, 'negative': -1, 'neutral': 0})

# Compute the next-period return for each company.
# Stable sort by date so rows within a company are in chronological order
# (rows sharing a date keep their existing relative order).
merged_data = merged_data.sort_index(kind='mergesort')
merged_data['Return'] = merged_data.groupby('Company')['Price'].pct_change()
# Shift inside each company group: a plain Series.shift(-1) would pull the
# value from the next row of the frame, which may belong to another company.
merged_data['Return'] = merged_data.groupby('Company')['Return'].shift(-1)
# Drop every row that has a NaN in any column (this includes the last row of
# each company, which has no next-period return).
merged_data.dropna(inplace=True)

# 회귀 분석 수행
def perform_regression(merged_df):
    """Fit an OLS regression of ``Return`` on ``Sentiment_Score``.

    The fitted model's summary is printed as a side effect.

    Args:
        merged_df: DataFrame providing the ``Sentiment_Score`` and ``Return``
            columns.

    Returns:
        The fitted statsmodels OLS results object.
    """
    # Explanatory variable, passed through add_constant to add the intercept column.
    design = sm.add_constant(merged_df[['Sentiment_Score']])
    # Dependent variable.
    response = merged_df['Return']

    fitted = sm.OLS(response, design).fit()
    print(fitted.summary())
    return fitted

# Run the OLS regression of Return on Sentiment_Score; its summary is printed.
regression_model = perform_regression(merged_data)

# Feature and target setup
features = ['open', 'high', 'low', 'volume', 'Sentiment_Score', 'change_percent', 'ma5', 'rsi']
target = 'Return'
X = merged_data[features]
# Binary label: 1 when Return is strictly positive, 0 otherwise.
y = (merged_data[target] > 0).astype(int)

# Train/test split (20% held out, fixed seed)
# NOTE(review): train_test_split shuffles rows by default. If the rows are
# date-ordered, the held-out rows are interleaved in time with the training
# rows -- confirm this is intended for a return-prediction task.
# NOTE(review): X_test / y_test are not used anywhere in this section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model
model_path = '/content/drive/MyDrive/stock_analysis_project/models/trained_model.joblib'
encoder_path = '/content/drive/MyDrive/stock_analysis_project/models/company_encoder.joblib'
dump(model, model_path)
# A OneHotEncoder fitted on the 'Company' column is saved alongside the model.
# 'Company' is not in `features`, so the classifier trained above does not
# consume this encoder's output.
dump(OneHotEncoder(handle_unknown='ignore').fit(merged_data[['Company']]), encoder_path)
