In [6]:
# TensorFlow 및 관련 라이브러리 임포트
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv2D, MaxPooling2D, Flatten, Dropout, Embedding, Bidirectional, GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, Precision, Recall

# 자연어처리를 위한 NLTK 라이브러리 임포트
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# 자연어처리를 위한 SpaCy 라이브러리 임포트
import spacy
from spacy.tokens import Doc, Span, Token
# 모델 다운로드
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# 자연어처리를 위한 Gensim 라이브러리 임포트
import gensim
from gensim.models import Word2Vec, FastText
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser

# 기타 유용한 라이브러리 임포트
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from collections import Counter
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kosmo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kosmo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kosmo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 648.1 kB/s eta 0:00:20
     --- ------------------------------------ 1.2/12.8 MB 9.6 MB/s eta 0:00:02
     -------------- ------------------------- 4.6/12.8 MB 26.9 MB/s eta 0:00:01
     ---------------------- ----------------- 7.2/12.8 MB 30.8 MB/s eta 0:00:01
     ------------------------------ --------- 9.7/12.8 MB 36.3 MB/s eta 0:00:01
     ------------------------------------ -- 12.1/12.8 MB 65.6 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 73.1 MB/s eta 0:00:01
     --------------------------------------- 12.8

In [9]:
tf.keras.models.load_model

<function keras.saving.saving_api.load_model(filepath, custom_objects=None, compile=True, safe_mode=True, **kwargs)>

In [7]:
kaggle_test = pd.read_csv("../csvfile/kaggle_test/test.csv")

kaggle_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [8]:
kaggle_test = pd.get_dummies(kaggle_test)

kaggle_test

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,False,False,False,True,False,False,False,False,True,False
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,False,False,False,True,False,False,False,False,True,False
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,False,False,False,True,False,False,False,False,True,False
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,False,False,False,True,False,False,False,False,True,False
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,21.0,1936,4,7,1970,1970,0.0,0.0,...,False,False,False,True,False,False,False,False,True,False
1455,2916,160,21.0,1894,4,5,1970,1970,0.0,252.0,...,False,False,False,True,True,False,False,False,False,False
1456,2917,20,160.0,20000,5,7,1960,1996,0.0,1224.0,...,False,False,False,True,True,False,False,False,False,False
1457,2918,85,62.0,10441,5,5,1992,1992,0.0,337.0,...,False,False,False,True,False,False,False,False,True,False


In [10]:
kaggle_test = kaggle_test.fillna(kaggle_test.mean())


kaggle_test

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,False,False,False,True,False,False,False,False,True,False
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,False,False,False,True,False,False,False,False,True,False
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,False,False,False,True,False,False,False,False,True,False
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,False,False,False,True,False,False,False,False,True,False
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,21.0,1936,4,7,1970,1970,0.0,0.0,...,False,False,False,True,False,False,False,False,True,False
1455,2916,160,21.0,1894,4,5,1970,1970,0.0,252.0,...,False,False,False,True,True,False,False,False,False,False
1456,2917,20,160.0,20000,5,7,1960,1996,0.0,1224.0,...,False,False,False,True,True,False,False,False,False,False
1457,2918,85,62.0,10441,5,5,1992,1992,0.0,337.0,...,False,False,False,True,False,False,False,False,True,False


In [12]:
kaggle_test = kaggle_test.astype(int)


kaggle_test

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80,11622,5,6,1961,1961,0,468,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81,14267,6,6,1958,1958,108,923,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74,13830,5,5,1997,1998,0,791,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78,9978,6,6,1998,1998,20,602,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43,5005,8,5,1992,1992,0,263,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,21,1936,4,7,1970,1970,0,0,...,0,0,0,1,0,0,0,0,1,0
1455,2916,160,21,1894,4,5,1970,1970,0,252,...,0,0,0,1,1,0,0,0,0,0
1456,2917,20,160,20000,5,7,1960,1996,0,1224,...,0,0,0,1,1,0,0,0,0,0
1457,2918,85,62,10441,5,5,1992,1992,0,337,...,0,0,0,1,0,0,0,0,1,0


In [14]:
#학습셋
kaggle_train = pd.read_csv("../csvfile/kaggle_test/train.csv")
kaggle_train = pd.get_dummies(kaggle_train)
kaggle_train = kaggle_train.fillna(kaggle_train.mean())
kaggle_train = kaggle_train.astype(int)
kaggle_train

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65,8450,7,5,2003,2003,196,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80,9600,6,8,1976,1976,0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68,11250,7,5,2001,2002,162,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60,9550,7,5,1915,1970,0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84,14260,8,5,2000,2000,350,655,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62,7917,6,5,1999,2000,0,0,...,0,0,0,1,0,0,0,0,1,0
1456,1457,20,85,13175,6,6,1978,1988,119,790,...,0,0,0,1,0,0,0,0,1,0
1457,1458,70,66,9042,7,9,1941,2006,0,275,...,0,0,0,1,0,0,0,0,1,0
1458,1459,20,68,9717,5,6,1950,1996,0,49,...,0,0,0,1,0,0,0,0,1,0


In [15]:
print(kaggle_test.columns)
kaggle_test_Id=kaggle_test['Id']
kaggle_train_Id=kaggle_train['Id']
x_test= kaggle_test.drop(columns=['Id']) 
x_train = kaggle_train.drop(columns=['Id']) 



Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=270)


In [16]:
sample_submission = pd.read_csv('../csvfile/kaggle_test/sample_submission.csv')

In [18]:
y_train = sample_submission.drop(columns=['Id'])
y_train

Unnamed: 0,SalePrice
0,169277.052498
1,187758.393989
2,183583.683570
3,179317.477511
4,150730.079977
...,...
1454,167081.220949
1455,164788.778231
1456,219222.423400
1457,184924.279659


In [42]:
if x_train.shape[0] != y_train.shape[0]:
    min_samples = min(x_train.shape[0],x_test.shape[0], y_train.shape[0])
    x_train = x_train[:min_samples]
    x_test = x_test[:min_samples]
    y_train = y_train[:min_samples]
    




In [40]:

same_columns = x_train.columns.intersection(x_test.columns)

x_train = x_train[same_columns]
x_test = x_test[same_columns]

In [41]:
x_test.info()
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 269 entries, MSSubClass to SaleCondition_Partial
dtypes: int32(269)
memory usage: 1.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 269 entries, MSSubClass to SaleCondition_Partial
dtypes: int32(269)
memory usage: 1.5 MB


In [43]:

from sklearn.linear_model import LinearRegression


model_lr = LinearRegression()
model_lr.fit(x_train,y_train)

In [44]:
y_pred = model_lr.predict(x_test)

In [45]:
y_pred

array([[184811.56926979],
       [191044.26493006],
       [178225.65342901],
       ...,
       [180760.05148002],
       [154244.48055707],
       [178255.06485979]])

In [55]:
from sklearn.discriminant_analysis import softmax


model = Sequential()

model.add(Dense(units=200,activation='tanh' ))
model.add(Dense(units=10,activation='sigmoid' ))



#모델 완성

NameError: name 'x_test' is not defined

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Condition2_RRAe
- Condition2_RRAn
- Condition2_RRNn
- Electrical_Mix
- Exterior1st_ImStucc
- ...


In [4]:
x_test.info()
x_train.info()

NameError: name 'x_test' is not defined

In [56]:
from sklearn import metrics


model.compile(optimizer='adam', loss='huber_loss', metrics=['mae'], loss_weights=None,
              sample_weight_mode=None, weighted_metrics=None)

In [57]:
model.fit(y_pred,y_train,epochs=100,batch_size=32)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1b4aa938c10>