In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

In [None]:
!pip install -U scikit-learn



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/pipe/lab4sem

/content/drive/MyDrive/pipe/lab4sem


In [None]:
data = pd.read_csv('Laptop_price.csv')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             1000 non-null   object 
 1   Processor_Speed   1000 non-null   float64
 2   RAM_Size          1000 non-null   int64  
 3   Storage_Capacity  1000 non-null   int64  
 4   Screen_Size       1000 non-null   float64
 5   Weight            1000 non-null   float64
 6   Price             1000 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 54.8+ KB


In [None]:
y = data[['Price']]

In [None]:
X = data.drop('Price', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

In [None]:
num_columns = list(X_train.select_dtypes(exclude=['object']).columns)
num_columns

['Processor_Speed', 'RAM_Size', 'Storage_Capacity', 'Screen_Size', 'Weight']

In [None]:
cat_columns = list(X_train.select_dtypes(include=['object']).columns)
cat_columns

['Brand']

In [None]:
numerical = Pipeline(steps=[
    ("simtle_imputer", SimpleImputer(strategy='constant')),
    ("power_transform", PowerTransformer()),
    ('Scaler',StandardScaler())
])

In [None]:
categorical = Pipeline(steps=[
    ("simple_limiter", SimpleImputer(strategy='constant')),
    ("OneHotEncoder", OneHotEncoder(handle_unknown='ignore',
                                    sparse_output=False))
])

In [None]:
ct = ColumnTransformer([
    ("numerical", numerical, num_columns ),
     ("categorical", categorical, cat_columns )
])

In [None]:
pipe = Pipeline(steps=[
    ("ct", ct),
    ("XGBRegressor", XGBRegressor())
])

In [None]:
param_grid={
    'XGBRegressor__n_estimators' : [50],
    'XGBRegressor__max_depth' : [3],
    'XGBRegressor__gamma' : [6],
    'XGBRegressor__max_leaves' : [0],
    'XGBRegressor__learning_rate' : [1],
    'ct__numerical' : [StandardScaler(), RobustScaler()]
}

In [None]:
clf = GridSearchCV(pipe, param_grid, verbose=3)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END XGBRegressor__gamma=6, XGBRegressor__learning_rate=1, XGBRegressor__max_depth=3, XGBRegressor__max_leaves=0, XGBRegressor__n_estimators=50, ct__numerical=StandardScaler();, score=0.999 total time=   0.6s
[CV 2/5] END XGBRegressor__gamma=6, XGBRegressor__learning_rate=1, XGBRegressor__max_depth=3, XGBRegressor__max_leaves=0, XGBRegressor__n_estimators=50, ct__numerical=StandardScaler();, score=0.999 total time=   0.2s
[CV 3/5] END XGBRegressor__gamma=6, XGBRegressor__learning_rate=1, XGBRegressor__max_depth=3, XGBRegressor__max_leaves=0, XGBRegressor__n_estimators=50, ct__numerical=StandardScaler();, score=0.999 total time=   0.0s
[CV 4/5] END XGBRegressor__gamma=6, XGBRegressor__learning_rate=1, XGBRegressor__max_depth=3, XGBRegressor__max_leaves=0, XGBRegressor__n_estimators=50, ct__numerical=StandardScaler();, score=0.999 total time=   0.0s
[CV 5/5] END XGBRegressor__gamma=6, XGBRegressor__learning_rate=1, XGBRe

In [None]:
#model=clf.best_estimator_

In [None]:
#clf.best_params_

{'XGBRegressor__gamma': 6,
 'XGBRegressor__learning_rate': 1,
 'XGBRegressor__max_depth': 3,
 'XGBRegressor__max_leaves': 0,
 'XGBRegressor__n_estimators': 50,
 'ct__numerical': StandardScaler()}

In [None]:
pip install mlflow



In [None]:
pip install optuna




In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector as selector
from xgboost import XGBClassifier
import optuna
import mlflow

In [104]:
%cd /content/drive/MyDrive/pipe/lab4sem

/content/drive/MyDrive/pipe/lab4sem


In [112]:
data = pd.read_csv('bank.csv', sep=';')

In [113]:
y = data['y'].apply(lambda x: 1 if x == 'yes' else 0)
X = data.drop('y', axis=1)

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [115]:
numeric_features = selector(dtype_include='number')(X)
categorical_features = selector(dtype_exclude='number')(X)

In [116]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

In [117]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])

In [119]:
def objective(trial):
    params = {
        'classifier__n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'classifier__learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'classifier__max_depth': trial.suggest_int('max_depth', 3, 10),
        'classifier__subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'classifier__colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'classifier__gamma': trial.suggest_int('gamma', 0, 5),
        'classifier__min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }

    model.set_params(**params)

    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)

    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

best_params = study.best_params
best_score = study.best_value



[I 2024-05-16 14:36:50,215] A new study created in memory with name: no-name-79f894bb-3e82-47d6-9c1d-ed32b896ccc1
[I 2024-05-16 14:36:51,620] Trial 0 finished with value: 0.9186695799951444 and parameters: {'n_estimators': 207, 'learning_rate': 0.09917306309506814, 'max_depth': 3, 'subsample': 0.6945574712670508, 'colsample_bytree': 0.8644530577910101, 'gamma': 2, 'min_child_weight': 2}. Best is trial 0 with value: 0.9186695799951444.
[I 2024-05-16 14:36:56,391] Trial 1 finished with value: 0.9181840252488468 and parameters: {'n_estimators': 248, 'learning_rate': 0.10406269870151562, 'max_depth': 4, 'subsample': 0.6487980833970991, 'colsample_bytree': 0.9429479538479887, 'gamma': 5, 'min_child_weight': 1}. Best is trial 0 with value: 0.9186695799951444.
[I 2024-05-16 14:36:58,908] Trial 2 finished with value: 0.9189123573682932 and parameters: {'n_estimators': 669, 'learning_rate': 0.07872494376202409, 'max_depth': 3, 'subsample': 0.6343997239754463, 'colsample_bytree': 0.8878608545985

In [120]:
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.log_metric('accuracy', best_score)

In [None]:
#lab 4
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=9f3152f3860098c5f5dd22ea4728ffb51366f19e49a33630a3352aa845adcb72
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

In [None]:
APP_NAME = "DataFrames"
SPARK_URL = "local[*]"

In [None]:
spark = SparkSession.builder.appName(APP_NAME) \
        .config('spark.ui.showConsoleProgress', 'false') \
        .getOrCreate()

In [None]:
%cd /content/drive/MyDrive/pipe/lab4sem

/content/drive/MyDrive/pipe/lab4sem


In [None]:
taxi = spark.read.load('taxi.csv',
                       format='csv', header='true', inferSchema='true')

In [None]:
taxi.registerTempTable("taxi")



In [None]:
print("Количество объектов в датафрейме:", taxi.count())

Количество объектов в датафрейме: 128974


In [None]:
taxi.show(n=5)

+----------+----+------+-------+
|      date|hour|minute|pickups|
+----------+----+------+-------+
|2009-01-01|   0|     0|   24.0|
|2009-01-01|   0|    30|   35.0|
|2009-01-01|   1|     0|   25.0|
|2009-01-01|   1|    30|   25.0|
|2009-01-01|   2|     0|   16.0|
+----------+----+------+-------+
only showing top 5 rows



In [None]:
taxi['date', 'hour', 'minute'].show(5)

+----------+----+------+
|      date|hour|minute|
+----------+----+------+
|2009-01-01|   0|     0|
|2009-01-01|   0|    30|
|2009-01-01|   1|     0|
|2009-01-01|   1|    30|
|2009-01-01|   2|     0|
+----------+----+------+
only showing top 5 rows



In [None]:
taxi.describe().show()

+-------+------------------+------------------+------------------+
|summary|              hour|            minute|           pickups|
+-------+------------------+------------------+------------------+
|  count|            128974|            128974|            128969|
|   mean|11.566509529052366|15.004419495402175|29.009451883786028|
| stddev| 6.908556452594711|15.000057500526209|  22.4493784836831|
|    min|                 0|                 0|               1.0|
|    max|                23|                30|             310.0|
+-------+------------------+------------------+------------------+



In [None]:
print(taxi.summary().show())


+-------+------------------+------------------+------------------+
|summary|              hour|            minute|           pickups|
+-------+------------------+------------------+------------------+
|  count|            128974|            128974|            128969|
|   mean|11.566509529052366|15.004419495402175|29.009451883786028|
| stddev| 6.908556452594711|15.000057500526209|  22.4493784836831|
|    min|                 0|                 0|               1.0|
|    25%|                 6|                 0|              11.0|
|    50%|                12|                30|              27.0|
|    75%|                18|                30|              40.0|
|    max|                23|                30|             310.0|
+-------+------------------+------------------+------------------+

None


In [None]:
taxi.fillna(0).describe().show()

+-------+------------------+------------------+------------------+
|summary|              hour|            minute|           pickups|
+-------+------------------+------------------+------------------+
|  count|            128974|            128974|            128974|
|   mean|11.566509529052366|15.004419495402175| 29.00832725975778|
| stddev| 6.908556452594711|15.000057500526209|22.449669931429067|
|    min|                 0|                 0|               0.0|
|    max|                23|                30|             310.0|
+-------+------------------+------------------+------------------+



In [None]:
print('Количество объектов без пропусков до удаления пропусков')
print('\nДо удаления пропусков   :', taxi.dropna(how='any', subset='pickups').count())

Количество объектов без пропусков до удаления пропусков

До удаления пропусков   : 128969


In [None]:
taxi = taxi.fillna(0)

In [None]:
print('После удаления пропусков:', taxi.dropna(how='any', subset='pickups').count())

После удаления пропусков: 128974


In [None]:
result = spark.sql("""
SELECT   *
FROM     taxi
ORDER BY pickups DESC
--LIMIT    5
;
""")

print(result.show(5))

+----------+----+------+-------+
|      date|hour|minute|pickups|
+----------+----+------+-------+
|2015-11-01|   1|    30|  310.0|
|2010-09-23|  22|    30|  288.0|
|2012-03-07|  21|     0|  268.0|
|2011-03-02|  20|    30|  264.0|
|2011-03-02|  18|    30|  263.0|
+----------+----+------+-------+
only showing top 5 rows

None


In [None]:
result = spark.sql("""
SELECT   COUNT(DISTINCT date)
FROM     taxi
WHERE    pickups > 200
;
""")

print(result.show())

+--------------------+
|count(DISTINCT date)|
+--------------------+
|                  21|
+--------------------+

None


In [None]:
print(spark.sql("""
SELECT EXTRACT(MONTH FROM date),
       AVG(pickups)
FROM   taxi
GROUP BY EXTRACT(MONTH FROM date)
ORDER BY AVG(pickups) DESC;
""").show())


+------------------------+------------------+
|extract(MONTH FROM date)|      avg(pickups)|
+------------------------+------------------+
|                       3| 34.61413319776309|
|                      10|31.492839171666343|
|                       2|29.856671982987773|
|                       5| 29.81593638978176|
|                       4|29.313725490196077|
|                       9|29.158446485623003|
|                      11|28.860367558929283|
|                       1|28.559511612021858|
|                       6| 27.03835736129314|
|                       7| 26.45983005021244|
|                      12| 26.45916884626562|
|                       8| 25.88592750533049|
+------------------------+------------------+

None


In [None]:
print(spark.sql("""
SELECT   hour,
         AVG(pickups)
FROM     taxi
GROUP BY hour
ORDER BY AVG(pickups) DESC
""").show(10))

+----+------------------+
|hour|      avg(pickups)|
+----+------------------+
|   8| 48.98208348725527|
|   9| 45.74220335855324|
|  18|45.131967515688444|
|  19| 40.18456995201181|
|  17| 37.68493909191584|
|  12| 36.91678966789668|
|  10|36.391031555637575|
|  14|35.965867158671585|
|   7| 35.94376618571957|
|  13| 35.34939091915836|
+----+------------------+
only showing top 10 rows

None
