In [5]:
import math
import random
import csv

def load_data(filename):
    """Read a CSV file into memory and print a short summary of it.

    Every row of the file is kept, including the first one; no row is
    treated specially as a header. Cells are returned as the strings
    produced by ``csv.reader``.

    NOTE(review): an identical ``load_data`` is defined again further down
    in this file; being later, that definition is the one bound to the name
    once the whole file has been executed.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of strings.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filename, newline='') as handle:
        rows = list(csv.reader(handle))

    row_count = len(rows)
    print(f"\nNumber of rows in the dataset: {row_count}")
    if rows:
        column_count = len(rows[0])
        print(f"Number of columns in the dataset: {column_count}")

    print("\nFirst 10 rows of data:")
    for preview in rows[:10]:
        print(preview)

    return rows

def remove_duplicates_keep_first(data):
    """Drop repeated rows, keeping the first occurrence of each.

    Rows are compared by their full contents (converted to a tuple so they
    can be stored in a set), and the relative order of the surviving rows
    is unchanged.

    Args:
        data: Iterable of rows; every cell must be hashable.

    Returns:
        A new list containing the original row objects without duplicates.
    """
    already_seen = set()
    kept = []
    for record in data:
        key = tuple(record)
        if key in already_seen:
            continue
        already_seen.add(key)
        kept.append(record)
    return kept

def convert_numeric_data(data):
    """Convert string cells to numbers, replacing everything else with None.

    Each cell is stripped of surrounding whitespace and then parsed as an
    ``int`` if possible, otherwise as a ``float``. Blank cells and cells
    that parse as neither become ``None``.

    The column count is taken from the first row: longer rows are truncated
    to that width, and a shorter row raises ``IndexError``.

    Args:
        data: List of rows whose cells are strings.

    Returns:
        A new list of rows holding ``int``, ``float`` or ``None`` values;
        an empty list when ``data`` is empty.
    """
    if not data:
        return []

    def parse_cell(cell):
        text = cell.strip()
        if not text:
            return None
        # Try the narrower type first so "3" stays an int rather than 3.0.
        for caster in (int, float):
            try:
                return caster(text)
            except ValueError:
                continue
        return None  # Mark non-numeric values as None

    width = len(data[0])
    return [[parse_cell(record[col]) for col in range(width)] for record in data]

def _is_numeric_column(data, col):
    """Return True if column ``col`` holds at least one value and all of its non-None values are numeric."""
    has_value = False
    for row in data:
        cell = row[col]
        if cell is None:
            continue
        try:
            float(cell)
        except (TypeError, ValueError):
            return False
        has_value = True
    return has_value


def filter_numeric_columns(data):
    """Keep only the columns that actually contain numeric data.

    A column is kept when every non-``None`` cell can be converted with
    ``float()`` and at least one such cell exists. Columns made up entirely
    of ``None`` are dropped: they carry no numeric information, and
    previously they were always kept because skipping ``None`` cells meant
    nothing could ever fail for them.

    Args:
        data: List of rows; the column count is taken from the first row.

    Returns:
        A new list of rows restricted to the numeric columns, in their
        original order. Returns an empty list when ``data`` is empty.
    """
    if not data:
        return []
    numeric_indices = [
        col for col in range(len(data[0])) if _is_numeric_column(data, col)
    ]
    return [[row[i] for i in numeric_indices] for row in data]

def load_data(filename):
    """Load every row of a CSV file and print a brief overview.

    NOTE(review): this repeats an identical ``load_data`` defined earlier
    in this file; as the later definition, this is the one in effect after
    the file has run.

    The overview printed to stdout consists of the row count, the column
    count of the first row (only when there is at least one row), and the
    first ten rows. The first row of the file is returned like any other.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of string cells as yielded by
        ``csv.reader``.
    """
    data = []
    # newline='' is the mode the csv module expects for files it reads.
    with open(filename, newline='') as csv_file:
        for record in csv.reader(csv_file):
            data.append(record)

    total_rows = len(data)
    print(f"\nNumber of rows in the dataset: {total_rows}")
    if total_rows:
        total_columns = len(data[0])
        print(f"Number of columns in the dataset: {total_columns}")

    print("\nFirst 10 rows of data:")
    head = data[:10]
    for record in head:
        print(record)
    return data

def train_test_split(dataset, test_size=0.2, seed=42):
    """Shuffle the rows deterministically and split them into train and test parts.

    The shuffle is done on a copy with a private ``random.Random(seed)``
    generator, so the caller's ``dataset`` is left untouched and the global
    ``random`` module state is not reseeded. The resulting order is the same
    one ``random.seed(seed); random.shuffle(...)`` would produce.

    Args:
        dataset: Rows to split; any iterable is accepted.
        test_size: Fraction of the rows to place in the test part.
        seed: Seed for the shuffle, making the split reproducible.

    Returns:
        A ``(train, test)`` tuple of new lists.
    """
    shuffled = list(dataset)
    random.Random(seed).shuffle(shuffled)
    split_idx = int(len(shuffled) * (1 - test_size))
    return shuffled[:split_idx], shuffled[split_idx:]

def get_min_max(data):
    """Compute the per-column minimum and maximum, ignoring None cells.

    Values are converted with ``float()`` before comparison. A column with
    no non-``None`` value keeps its initial bounds of ``inf`` (minimum) and
    ``-inf`` (maximum). The column count comes from the first row, so
    ``data`` must not be empty.

    Args:
        data: Non-empty list of rows.

    Returns:
        A ``(mins, maxs)`` tuple of lists of floats, one entry per column.
    """
    width = len(data[0])
    lows = [float('inf')] * width
    highs = [float('-inf')] * width
    for record in data:
        for col, cell in enumerate(record):
            if cell is None:
                continue
            number = float(cell)
            lows[col] = min(lows[col], number)
            highs[col] = max(highs[col], number)
    return lows, highs

def scale_data(data, mins, maxs):
    """Min-max scale every cell using the supplied per-column bounds.

    A ``None`` cell is replaced by the column minimum before scaling. When a
    column's maximum is not strictly greater than its minimum, the scaled
    value is ``0.0``.

    Args:
        data: List of rows to scale.
        mins: Per-column minimum values.
        maxs: Per-column maximum values.

    Returns:
        A new list of rows of floats.
    """
    def rescale(col, cell):
        low = mins[col]
        high = maxs[col]
        # Missing cells are imputed with the column minimum.
        number = float(low if cell is None else cell)
        if high > low:
            return (number - low) / (high - low)
        return 0.0

    return [
        [rescale(col, cell) for col, cell in enumerate(record)]
        for record in data
    ]

def _sigmoid(z):
    """Return the logistic function of ``z``, saturating at 0.0 instead of overflowing."""
    try:
        return 1 / (1 + math.exp(-z))
    except OverflowError:
        # math.exp(-z) only overflows for very negative z, where the
        # sigmoid is 0 to double precision.
        return 0.0


def logistic_regression_fit(train_data, lr=0.1, epochs=500):
    """Fit a logistic-regression model with per-sample gradient descent.

    Every row is treated as feature values followed by the target in the
    last position. Features are converted with ``float()`` and the target
    with ``int()``. Weights and bias start at zero and are updated after
    each sample, in the order the rows appear, for ``epochs`` passes.

    Args:
        train_data: List of rows of the form ``[feature..., target]``.
        lr: Learning rate applied to every update.
        epochs: Number of full passes over ``train_data``.

    Returns:
        A ``(weights, bias)`` tuple, with one weight per feature column of
        the first row. Returns ``([], 0.0)`` when ``train_data`` is empty.
    """
    if not train_data:
        return [], 0.0

    # Parse the rows once instead of on every epoch.
    samples = [([float(x) for x in row[:-1]], int(row[-1])) for row in train_data]
    weights = [0.0] * (len(train_data[0]) - 1)
    bias = 0.0
    for _ in range(epochs):
        for features, target in samples:
            z = sum(w * f for w, f in zip(weights, features)) + bias
            error = _sigmoid(z) - target
            for i, weight in enumerate(weights):
                weights[i] = weight - lr * error * features[i]
            bias -= lr * error
    return weights, bias
   

def predict(features, weights, bias):
    """Classify one sample with a fitted logistic-regression model.

    Args:
        features: Feature values; each is converted with ``float()``.
        weights: Model weights, paired with ``features`` position by position.
        bias: Model bias term.

    Returns:
        ``1`` when the predicted probability is at least 0.5, otherwise ``0``.
    """
    z = sum(w * float(f) for w, f in zip(weights, features)) + bias
    try:
        probability = 1 / (1 + math.exp(-z))
    except OverflowError:
        # math.exp(-z) only overflows for very negative z, where the
        # probability is 0 to double precision.
        probability = 0.0
    return 1 if probability >= 0.5 else 0

def evaluate(test_data, weights, bias):
    """Return the fraction of rows whose prediction matches the target.

    Each row is split into its features (all but the last cell) and its
    target (the last cell, converted with ``int()``); the features are
    classified with ``predict``.

    Args:
        test_data: List of rows of the form ``[feature..., target]``.
        weights: Model weights passed through to ``predict``.
        bias: Model bias passed through to ``predict``.

    Returns:
        The accuracy as a float, or ``0`` when ``test_data`` is empty.
    """
    hits = 0
    for sample in test_data:
        guess = predict(sample[:-1], weights, bias)
        if guess == int(sample[-1]):
            hits += 1
    if not test_data:
        return 0
    return hits / len(test_data)

def main():
    """Run the whole pipeline on 'wines_SPA.csv' and print the test accuracy.

    Steps: load the CSV, drop its header row, remove duplicate rows, convert
    cells to numbers, keep the numeric columns, report the number of missing
    values, split into train and test parts, min-max scale both parts with
    bounds computed on the training part only, fit the logistic-regression
    model and print its accuracy on the test part.
    """
    filename = 'wines_SPA.csv'
    data = load_data(filename)
    # The first row of the file holds the column names, not a sample. Left
    # in place it would be converted to a row of missing values and then
    # used for training or evaluation like any real record.
    data = data[1:]
    data = remove_duplicates_keep_first(data)
    data = convert_numeric_data(data)
    data = filter_numeric_columns(data)
    missing_value_count = sum(1 for row in data for value in row if value is None)
    print(f"\nNumber of missing values in the dataset: {missing_value_count}")
    train_data, test_data = train_test_split(data)
    # Scaling bounds come from the training part only and are reused for
    # the test part.
    mins, maxs = get_min_max(train_data)
    train_scaled = scale_data(train_data, mins, maxs)
    test_scaled = scale_data(test_data, mins, maxs)
    weights, bias = logistic_regression_fit(train_scaled)
    accuracy = evaluate(test_scaled, weights, bias)
    print(f'Accuracy: {accuracy:.4f}')

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Number of rows in the dataset: 7501
Number of columns in the dataset: 11

First 10 rows of data:
['winery', 'wine', 'year', 'rating', 'num_reviews', 'country', 'region', 'price', 'type', 'body', 'acidity']
['Teso La Monja', 'Tinto', '2013', '4.9', '58', 'Espana', 'Toro', '995', 'Toro Red', '5', '3']
['Artadi', 'Vina El Pison', '2018', '4.9', '31', 'Espana', 'Vino de Espana', '313.5', 'Tempranillo', '4', '2']
['Vega Sicilia', 'Unico', '2009', '4.8', '1793', 'Espana', 'Ribera del Duero', '324.95', 'Ribera Del Duero Red', '5', '3']
['Vega Sicilia', 'Unico', '1999', '4.8', '1705', 'Espana', 'Ribera del Duero', '692.96', 'Ribera Del Duero Red', '5', '3']
['Vega Sicilia', 'Unico', '1996', '4.8', '1309', 'Espana', 'Ribera del Duero', '778.06', 'Ribera Del Duero Red', '5', '3']
['Vega Sicilia', 'Unico', '1998', '4.8', '1209', 'Espana', 'Ribera del Duero', '490', 'Ribera Del Duero Red', '5', '3']
['Vega Sicilia', 'Unico', '2010', '4.8', '1201', 'Espana', 'Ribera del Duero', '349', 'Ribera Del 