# Подготовка модели для предсказания погоды


Необходимые импорты и игнорирование предупреждений  


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# LabelEncoder's documented public location is sklearn.preprocessing; it is
# reachable from sklearn.calibration only because that module imports it for
# its own use, so that path is not a stable import location.
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

import warnings
warnings.filterwarnings("ignore")

# Обзор данных 

* Ознакомление с данными и их структурой.

* Проверка наличия пропущенных значений, выбросов и ошибок в данных.

In [4]:
# Read the weather observations from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='weather_data.csv')
# Preview the first three rows to check the column layout.
data.head(n=3)

Unnamed: 0,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather
0,1/1/2012 0:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,1/1/2012 1:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,1/1/2012 2:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"


In [6]:
# Replace the raw CSV headers with more readable column names.
data = data.rename(
    columns={
        'Temp_C': 'Temperature (C)',
        'Dew Point Temp_C': 'Dew Point (C)',
        'Rel Hum_%': 'Humidity',
        'Wind Speed_km/h': 'Wind Speed (km/h)',
        'Visibility_km': 'Visibility (km)',
        'Press_kPa': 'Pressure (kPa)',
    }
)
# Report the dtype of every column, then the count of missing values per column.
print(f'Типы данных столбцов:\n {data.dtypes}\n')
print(f'Количество пропущенных значений:\n {data.isna().sum()}')

Типы данных столбцов:
 Date/Time             object
Temperature (C)      float64
Dew Point (C)        float64
Humidity               int64
Wind Speed (km/h)      int64
Visibility (km)      float64
Pressure (kPa)       float64
Weather               object
dtype: object

Количество пропущенных значений:
 Date/Time            0
Temperature (C)      0
Dew Point (C)        0
Humidity             0
Wind Speed (km/h)    0
Visibility (km)      0
Pressure (kPa)       0
Weather              0
dtype: int64


Преобразование столбца `Date/Time` в формат `datetime` для
построения графиков временных рядов

In [8]:
# Parse the "month/day/year hour:minute" strings into datetime values and
# store them back in the same column.
data["Date/Time"] = data["Date/Time"].pipe(pd.to_datetime, format="%m/%d/%Y %H:%M")
# Show the first three rows to confirm the conversion.
data.head(n=3)

Unnamed: 0,Date/Time,Temperature (C),Dew Point (C),Humidity,Wind Speed (km/h),Visibility (km),Pressure (kPa),Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"


# Статистический анализ 

* Вычисление основных статистических характеристик для каждого признака (среднее, медиана, минимум, максимум и т.д.).

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# describe() covers by default; the text column Weather does not appear in the result.
data.describe()

Unnamed: 0,Date/Time,Temperature (C),Dew Point (C),Humidity,Wind Speed (km/h),Visibility (km),Pressure (kPa)
count,8784,8784.0,8784.0,8784.0,8784.0,8784.0,8784.0
mean,2012-07-01 23:30:00,8.798144,2.555294,67.431694,14.945469,27.664447,101.051623
min,2012-01-01 00:00:00,-23.3,-28.5,18.0,0.0,0.2,97.52
25%,2012-04-01 11:45:00,0.1,-5.9,56.0,9.0,24.1,100.56
50%,2012-07-01 23:30:00,9.3,3.3,68.0,13.0,25.0,101.07
75%,2012-10-01 11:15:00,18.8,11.8,81.0,20.0,25.0,101.59
max,2012-12-31 23:00:00,33.0,24.4,100.0,83.0,48.3,103.65
std,,11.687883,10.883072,16.918881,8.688696,12.622688,0.844005
