In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=';')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])

df = df.drop(columns=['points'])
df= df.dropna()
df.info()

# Define input and output features
X = df[['country', 'province', 'variety', 'winery', 'color']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Define decision tree model
base_model = DecisionTreeClassifier()

# Define bagging classifier with decision tree base model
model = BaggingClassifier(base_estimator=base_model, n_estimators=50)

# Fit bagging model
model.fit(X_train, y_train)

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207029 entries, 0 to 207931
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         207029 non-null  object  
 1   province        207029 non-null  object  
 2   variety         207029 non-null  object  
 3   winery          207029 non-null  object  
 4   color           207029 non-null  object  
 5   points_nominal  207029 non-null  category
dtypes: category(1), object(5)
memory usage: 9.7+ MB
Accuracy: 0.5832005023426556


In [16]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=';')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])

df = df.drop(columns=['points'])
df= df.dropna()
df.info()

# Define input and output features
X = df[['country', 'province', 'variety', 'winery', 'color']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize SGD classifier
model = SGDClassifier(loss='modified_huber', penalty='elasticnet', max_iter=1000, tol=1e-3, random_state=42)

# Partially fit the model using the training data
batch_size = 4500
for i in range(0, X_train.get_shape()[0], batch_size):
    X_batch = X_train[i:i+batch_size]
    y_batch = y_train[i:i+batch_size]
    model.partial_fit(X_batch, y_batch, classes=y.unique())

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207029 entries, 0 to 207931
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         207029 non-null  object  
 1   province        207029 non-null  object  
 2   variety         207029 non-null  object  
 3   winery          207029 non-null  object  
 4   color           207029 non-null  object  
 5   points_nominal  207029 non-null  category
dtypes: category(1), object(5)
memory usage: 9.7+ MB
Accuracy: 0.5004105685166401


In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KDTree
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=';')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 'very high quality', 'excellent quality'])

df = df.drop(columns=['points'])
df = df.dropna()
df.info()

# Encode categorical features
le = LabelEncoder()
df['country'] = le.fit_transform(df['country'])
df['province'] = le.fit_transform(df['province'])
df['variety'] = le.fit_transform(df['variety'])
df['winery'] = le.fit_transform(df['winery'])
df['color'] = le.fit_transform(df['color'])
df['points_nominal'] = le.fit_transform(df['points_nominal'])

# Define input and output features
X = df[['country', 'province', 'variety', 'winery', 'color']]
y = df['points_nominal']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit KDTree model
kdtree = KDTree(X_train)

# Find k nearest neighbors for each point in test set
k = 10
distances, indices = kdtree.query(X_test, k)

# Make predictions using majority vote of k nearest neighbors
y_pred = []
for i in range(len(X_test)):
    neighbors = y_train.iloc[indices[i]]
    y_pred.append(neighbors.value_counts().index[0])

# Evaluate model
accuracy = sum(y_pred == y_test) / len(y_test)
print('Accuracy:', accuracy)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207029 entries, 0 to 207931
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         207029 non-null  object  
 1   province        207029 non-null  object  
 2   variety         207029 non-null  object  
 3   winery          207029 non-null  object  
 4   color           207029 non-null  object  
 5   points_nominal  207029 non-null  category
dtypes: category(1), object(5)
memory usage: 9.7+ MB
Accuracy: 0.5268560112061054


In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=';')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])

df = df.drop(columns=['points'])
df= df.dropna()
df.info()

# Define input and output features
X = df[['country', 'province', 'variety', 'winery', 'color']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Fit AdaBoost model with decision tree classifier as base estimator
base_estimator = DecisionTreeClassifier(max_depth=2)
model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207029 entries, 0 to 207931
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         207029 non-null  object  
 1   province        207029 non-null  object  
 2   variety         207029 non-null  object  
 3   winery          207029 non-null  object  
 4   color           207029 non-null  object  
 5   points_nominal  207029 non-null  category
dtypes: category(1), object(5)
memory usage: 9.7+ MB
Accuracy: 0.4709462396754094


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=';')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])

df = df.drop(columns=['points'])
df= df.dropna()
df.info()

# Define input and output features
X = df[['country', 'province', 'variety', 'winery', 'color']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize Online Random Forest model
model = RandomForestClassifier(n_estimators=100, max_depth=10, oob_score=True)

# Initialize an empty buffer to store the training data
X_buffer = []
y_buffer = []

# Define the size of the buffer and the size of the mini-batches
buffer_size = 10000
batch_size = 1000

# Fit the model to the data in mini-batches
for i in range(0, X_train.get_shape()[0], batch_size):
    X_batch = X_train[i:i+batch_size]
    y_batch = y_train[i:i+batch_size]
    
    # Add the mini-batch to the buffer
    X_buffer.append(X_batch)
    y_buffer.append(y_batch)
    
    # If the buffer is full, fit the model to the buffer and clear it
    if len(X_buffer) == buffer_size // batch_size:
        model.partial_fit(X_buffer, y_buffer, classes=y.unique())
        X_buffer = []
        y_buffer = []

# If there is any data left in the buffer, fit the model to it
if len(X_buffer) > 0:
    model.partial_fit(X_buffer, y_buffer, classes=y.unique())

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207029 entries, 0 to 207931
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         207029 non-null  object  
 1   province        207029 non-null  object  
 2   variety         207029 non-null  object  
 3   winery          207029 non-null  object  
 4   color           207029 non-null  object  
 5   points_nominal  207029 non-null  category
dtypes: category(1), object(5)
memory usage: 9.7+ MB


AttributeError: AttributeError: 'RandomForestClassifier' object has no attribute 'partial_fit'