In [3]:
# COUNTRY+PROVINCE+VARIETY+WINERY+PRICE

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=',')


df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])
df = df.drop(columns=['points'])

df= df.dropna(subset=['country', 'province', 'variety', 'winery','price','points_nominal'])

Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# identify the outliers using the IQR method
outliers = df[(df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))]
# drop the outlier rows from the DataFrame
df = df.drop(outliers.index)

df.info()
# Define input and output features
X = df[['country', 'province', 'variety', 'winery','price']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


# Fit Gradient Boosting model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187705 entries, 1 to 217779
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         187705 non-null  object  
 1   price           187705 non-null  float64 
 2   province        187705 non-null  object  
 3   variety         187705 non-null  object  
 4   winery          187705 non-null  object  
 5   color           178986 non-null  object  
 6   points_nominal  187705 non-null  category
dtypes: category(1), float64(1), object(5)
memory usage: 10.2+ MB
Accuracy: 0.6599984017474229


In [7]:
# COUNTRY+PROVINCE+VARIETY+WINERY+PRICE

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Load data
dftotal = pd.read_csv('combined_dataset.csv', delimiter=',')

# Create the second DataFrame
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=',')

# Take the 'A' column from df1 and add it to df2
df['region1'] = dftotal['region_1']

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])
df = df.drop(columns=['points'])

df= df.dropna(subset=['country', 'province', 'variety', 'winery','price','region1','points_nominal'])

Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# identify the outliers using the IQR method
outliers = df[(df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))]
# drop the outlier rows from the DataFrame
df = df.drop(outliers.index)

df.info()
# Define input and output features
X = df[['country', 'province', 'variety', 'winery','price', 'region1']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


# Fit Gradient Boosting model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158266 entries, 2 to 217749
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         158266 non-null  object  
 1   price           158266 non-null  float64 
 2   province        158266 non-null  object  
 3   variety         158266 non-null  object  
 4   winery          158266 non-null  object  
 5   color           150994 non-null  object  
 6   region1         158266 non-null  object  
 7   points_nominal  158266 non-null  category
dtypes: category(1), float64(1), object(6)
memory usage: 9.8+ MB
Accuracy: 0.6001769128704113


  exec(code_obj, self.user_global_ns, self.user_ns)


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=',')

# Define target variable and input features
df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])
df = df.drop(columns=['points'])
df = df.dropna(subset=['country', 'province', 'variety', 'winery','price','points_nominal'])
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))]
df = df.drop(outliers.index)
X = df[['country', 'province', 'variety', 'winery', 'price']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Define random forest classifier and parameter distributions for randomized search
rf = RandomForestClassifier(random_state=42)
param_dist = {
    'n_estimators': randint(10, 500),
    'max_depth': [3, 5, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Perform randomized search for hyperparameter tuning
rf_random = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
rf_random.fit(X_train, y_train)

# Print best hyperparameters and accuracy score
print('Best hyperparameters:', rf_random.best_params_)
print('Accuracy:', rf_random.best_score_)

ParserError: ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [14]:
# COUNTRY+COLOR+VARIETY+WINERY+PRICE

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=',')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])
df = df.drop(columns=['points'])

df= df.dropna(subset=['country', 'color', 'variety', 'winery','price','points_nominal'])

Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# identify the outliers using the IQR method
outliers = df[(df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))]
# drop the outlier rows from the DataFrame
df = df.drop(outliers.index)

df.info()
# Define input and output features
X = df[['country', 'variety', 'winery','price','color']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Fit decision tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179174 entries, 1 to 217779
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         179174 non-null  object  
 1   price           179174 non-null  float64 
 2   province        179174 non-null  object  
 3   variety         179174 non-null  object  
 4   winery          179174 non-null  object  
 5   color           179174 non-null  object  
 6   points_nominal  179174 non-null  category
dtypes: category(1), float64(1), object(5)
memory usage: 9.7+ MB
Accuracy: 0.6580717175945305


In [15]:
# COUNTRY+PROVINCE+VARIETY+WINERY+PRICE

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=',')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])
df = df.drop(columns=['points'])

df= df.dropna(subset=['country', 'province', 'variety', 'winery','price','points_nominal'])

Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# identify the outliers using the IQR method
outliers = df[(df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))]
# drop the outlier rows from the DataFrame
df = df.drop(outliers.index)

df.info()
# Define input and output features
X = df[['country', 'province', 'variety', 'winery','price']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Fit decision tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179174 entries, 1 to 217779
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         179174 non-null  object  
 1   price           179174 non-null  float64 
 2   province        179174 non-null  object  
 3   variety         179174 non-null  object  
 4   winery          179174 non-null  object  
 5   color           179174 non-null  object  
 6   points_nominal  179174 non-null  category
dtypes: category(1), float64(1), object(5)
memory usage: 9.7+ MB
Accuracy: 0.5810799497697782


In [10]:
is_country = ['is Germany', 'is Austria', 'is Australia', 'is Argentina', 'is Chile', 'is Portugal', 'is Spain', 'is Italy', 'is France', 'is US', 'is Australian']
is_year = ['is 1997', 'is 1998', 'is 1999', 'is 2000', 'is 2001', 'is 2002', 'is 2003', 'is 2004', 'is 2005', 'is 2006', 'is 2007', 'is 2008', 'is 2009', 'is 2010', 'is 2011', 'is 2012', 'is 2013', 'is 2014', 'is 2015', 'is 2016', 'is 2017']
is_region = ['is European', 'is North American', 'is South American', 'is Australian']
is_variety = ['is Rosé', 'is Merlot', 'is Syrah', 'is Riesling', 'is Sauvignon Blanc', 'is Bordeaux-style Red Blend', 'is Red Blend', 'is Cabernet Sauvignon', 'is Chardonnay', 'is Pinot Noir']
is_color = ['is Rosé', 'is Bordeaux-style Red Blend', 'is Red Blend', 'is Red', 'is White', 'is Sparkling']

In [20]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Load the merged_dataframes_total dataframe
merged_dataframes_total = pd.read_csv('binarydataframe.csv')

# calculate the interquartile range for the 'price' column
Q1 = merged_dataframes_total['price'].quantile(0.25)
Q3 = merged_dataframes_total['price'].quantile(0.75)
IQR = Q3 - Q1

# identify the outliers using the IQR method
outliers = merged_dataframes_total[(merged_dataframes_total['price'] < (Q1 - 1.5 * IQR)) | (merged_dataframes_total['price'] > (Q3 + 1.5 * IQR))]

# drop the outlier rows from the DataFrame
merged_dataframes_total = merged_dataframes_total.drop(outliers.index)

merged_dataframes_total = merged_dataframes_total.dropna(subset=['price'])

null_columns = merged_dataframes_total.columns[merged_dataframes_total.isnull().any()]
print('Null Columns \n', merged_dataframes_total[null_columns].isnull().sum().head(50))
 

merged_dataframes_total['points'] = merged_dataframes_total['points'].astype(str)

# Split the data into training and testing sets
X = merged_dataframes_total[is_country+is_variety+['price']] # drop the target variable 'points'
y = merged_dataframes_total['points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's accuracy and confusion matrix
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)

# Visualize the decision tree
plt.figure(figsize=(20,10))
plot_tree(clf, filled=True, feature_names=X.columns, class_names=y.unique())
plt.show()

Null Columns 
 is 1997    87639
is 1998    87639
is 1999    87639
is 2000    87639
is 2001    87639
is 2002    87639
is 2003    87639
is 2004    87639
is 2005    87639
is 2006    87639
is 2007    87639
is 2008    87639
is 2009    87639
is 2010    87639
is 2011    87639
is 2012    87639
is 2013    87639
is 2014    87639
is 2015    87639
is 2016    87639
is 2017    87639
dtype: int64
Accuracy: 0.1872544856141841
Confusion Matrix:
 [[   1    1    7    7   34   36   33   36   27    4    7    3    0    1
     0    0    0    0    0    0]
 [   0    5   12    6   57   36   58   80   48    6   15    4    3    1
     1    0    0    0    0    0]
 [   0    3   16   18   71  103  124  226  143   19   60   21   15    1
     0    0    0    0    0    0]
 [   0    5   14   59  195  183  164  335  201   25   74   26   19    0
     0    0    0    0    0    0]
 [   1    4   18   43  260  342  382  678  382   54  195   52   59    6
     2    0    0    0    0    0]
 [   0    0   10   33  244  478  484  899 

KeyboardInterrupt: KeyboardInterrupt: 

In [2]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=';')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])

df = df.drop(columns=['points'])
df = df.dropna()
df.info()

# Define input and output features
X = df[['country', 'province', 'variety', 'winery', 'color']]
y = df['points_nominal']

# Ordinal encode input features
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Define parameter grid for random forest model
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Fit random forest model with hyperparameter tuning
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best hyperparameters and accuracy on test set
print('Best hyperparameters:', grid_search.best_params_)
score = grid_search.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207029 entries, 0 to 207931
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         207029 non-null  object  
 1   province        207029 non-null  object  
 2   variety         207029 non-null  object  
 3   winery          207029 non-null  object  
 4   color           207029 non-null  object  
 5   points_nominal  207029 non-null  category
dtypes: category(1), object(5)
memory usage: 9.7+ MB


TerminatedWorkerError: TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('decisiontree_dataframe.csv', delimiter=',')

df['points_nominal'] = pd.cut(df['points'], [80, 84, 88, 92, 96, 100], 
                              labels=['normal quality', 'good quality', 'high quality', 
                                      'very high quality', 'excellent quality'])

df = df.drop(columns=['points'])
df= df.dropna()
df.info()

# Define input and output features
X = df[['country', 'province', 'variety', 'winery', 'price']]
y = df['points_nominal']

# One-hot encode input features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Fit random forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate model
score = model.score(X_test, y_test)
print('Accuracy:', score)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190701 entries, 1 to 217779
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   country         190701 non-null  object  
 1   price           190701 non-null  float64 
 2   province        190701 non-null  object  
 3   variety         190701 non-null  object  
 4   winery          190701 non-null  object  
 5   color           190701 non-null  object  
 6   points_nominal  190701 non-null  category
dtypes: category(1), float64(1), object(5)
memory usage: 10.4+ MB


KeyboardInterrupt: KeyboardInterrupt: 