In [1]:
# Importing necessary libraries for data manipulation and inspection
import pandas as pd
import numpy as np

# Read the startups CSV into a DataFrame
dataset_path = '50_Startups.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the top of the table (head() defaults to 5 rows)
df.head(n=5)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
# Map each original column label to its new name
new_column_names = {
    'R&D Spend': 'RnD_Expenditure',
    'Administration': 'Admin_Costs',
    'Marketing Spend': 'Marketing_Expenditure',
    'State': 'Location',
    'Profit': 'Net_Profit',
}

# Rebind df to the renamed frame instead of mutating it in place
df = df.rename(columns=new_column_names)

# Preview the first rows to verify the new labels
df.head()


Unnamed: 0,RnD_Expenditure,Admin_Costs,Marketing_Expenditure,Location,Net_Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Attach a synthetic "Industry_Sector" label to every row, drawn at random
np.random.seed(0)  # fixed seed keeps the assignment reproducible
industry_sectors = ['Tech', 'Healthcare', 'Retail', 'Finance', 'Energy']
row_count = len(df)
df['Industry_Sector'] = np.random.choice(industry_sectors, size=row_count)

# Preview the first rows to check the new column
df.head()


Unnamed: 0,RnD_Expenditure,Admin_Costs,Marketing_Expenditure,Location,Net_Profit,Industry_Sector
0,165349.2,136897.8,471784.1,New York,192261.83,Energy
1,162597.7,151377.59,443898.53,California,191792.06,Tech
2,153441.51,101145.55,407934.54,Florida,191050.39,Finance
3,144372.41,118671.85,383199.62,New York,182901.99,Finance
4,142107.34,91391.77,366168.42,Florida,166187.94,Finance


In [4]:
# Build "Adjusted_Profit_With_Noise": Net_Profit plus 20% of RnD_Expenditure plus Gaussian noise.
# The three random draws below (normal, randint, choice) must stay in this order so the
# seeded random stream produces the same values.
np.random.seed(0)
row_count = len(df)
noise = np.random.normal(loc=0, scale=1000, size=row_count)
rnd_contribution = 0.2 * df['RnD_Expenditure']
df['Adjusted_Profit_With_Noise'] = df['Net_Profit'] + rnd_contribution + noise

# Two further synthetic columns:
# "Employee_Count" - random integer head count in [10, 500)
df['Employee_Count'] = np.random.randint(low=10, high=500, size=row_count)

# "Investment_Round" - random categorical funding stage
investment_rounds = ['Seed', 'Series A', 'Series B', 'Series C', 'IPO']
df['Investment_Round'] = np.random.choice(investment_rounds, size=row_count)

# Preview the first rows to check the three new columns
df.head()


Unnamed: 0,RnD_Expenditure,Admin_Costs,Marketing_Expenditure,Location,Net_Profit,Industry_Sector,Adjusted_Profit_With_Noise,Employee_Count,Investment_Round
0,165349.2,136897.8,471784.1,New York,192261.83,Energy,227095.722346,289,Series A
1,162597.7,151377.59,443898.53,California,191792.06,Tech,224711.757208,217,Series B
2,153441.51,101145.55,407934.54,Florida,191050.39,Finance,222717.429984,407,Series C
3,144372.41,118671.85,383199.62,New York,182901.99,Finance,214017.365199,383,Series A
4,142107.34,91391.77,366168.42,Florida,166187.94,Finance,196476.96599,351,Series B


In [5]:
# Derived numeric column for the existing rows: administration plus marketing spend
df['Operational_Cost'] = df['Admin_Costs'] + df['Marketing_Expenditure']

# Synthesize 20 extra rows, all tagged with the "Seed" investment round.
# Values are drawn over wide ranges (Net_Profit may be negative) so they act as noisy rows.
np.random.seed(1)  # fixed seed keeps the synthetic rows reproducible
n_new_rows = 20

# The random draws must happen in exactly this order: reordering them would change
# the values produced by the seeded random stream.
additional_data = {}
additional_data['RnD_Expenditure'] = np.random.uniform(0, 200000, size=n_new_rows)
additional_data['Admin_Costs'] = np.random.uniform(0, 150000, size=n_new_rows)
additional_data['Marketing_Expenditure'] = np.random.uniform(0, 500000, size=n_new_rows)
additional_data['Location'] = np.random.choice(['New York', 'California', 'Florida'], size=n_new_rows)
additional_data['Net_Profit'] = np.random.uniform(-50000, 200000, size=n_new_rows)
additional_data['Industry_Sector'] = np.random.choice(
    ['Tech', 'Healthcare', 'Retail', 'Finance', 'Energy'], size=n_new_rows
)
additional_data['Employee_Count'] = np.random.randint(10, 500, size=n_new_rows)
additional_data['Investment_Round'] = ['Seed' for _ in range(n_new_rows)]  # every new row is "Seed"

# Tabulate the synthetic rows
additional_df = pd.DataFrame(additional_data)

# Derived columns for the new rows. "Adjusted_Profit" exists only on these rows, so after the
# concat below it is NaN for all pre-existing rows.
# NOTE(review): the existing rows were built with 0.2 * RnD_Expenditure, these use 0.1 -
# confirm the difference is intentional.
noise_for_additional_data = np.random.normal(0, 1000, size=n_new_rows)
adjusted_profit = additional_df['Net_Profit'] + 0.1 * additional_df['RnD_Expenditure']
additional_df = additional_df.assign(
    Adjusted_Profit=adjusted_profit,
    Adjusted_Profit_With_Noise=adjusted_profit + noise_for_additional_data,
    Operational_Cost=additional_df['Admin_Costs'] + additional_df['Marketing_Expenditure'],
)

# Stack the synthetic rows under the existing ones with a fresh 0..n-1 index
df = pd.concat((df, additional_df), axis=0, ignore_index=True)

# Show the last 25 rows: a few original rows followed by all 20 new "Seed" rows
df.tail(n=25)


Unnamed: 0,RnD_Expenditure,Admin_Costs,Marketing_Expenditure,Location,Net_Profit,Industry_Sector,Adjusted_Profit_With_Noise,Employee_Count,Investment_Round,Operational_Cost,Adjusted_Profit
45,1000.23,124153.04,1903.93,New York,64926.08,Energy,64688.051698,461,Seed,126056.97,
46,1315.46,115816.21,297114.46,Florida,49490.75,Tech,48501.04664,92,Seed,412930.67,
47,0.0,135426.92,0.0,California,42559.73,Energy,43337.220356,440,Seed,135426.92,
48,542.05,51743.15,0.0,New York,35673.41,Healthcare,34167.922152,237,Series A,51743.15,
49,0.0,116983.8,45173.06,California,14681.4,Energy,14468.65972,158,Seed,162156.86,
50,83404.400941,120111.685301,494430.544453,New York,49419.209246,Retail,56804.550646,408,Seed,614542.229755,57759.64934
51,144064.898688,145239.236358,374082.82719,California,-8661.450721,Retail,5768.58932,151,Seed,519322.063548,5745.039148
52,22.874963,47013.626724,140221.996032,California,181877.145099,Tech,180909.147951,222,Seed,187235.622756,181879.432595
53,60466.514526,103848.39235,394639.664226,California,36941.464936,Healthcare,42223.865461,126,Seed,498488.056576,42988.116389
54,29351.178163,131458.372844,51613.003289,New York,137703.025784,Retail,140372.9982,309,Seed,183071.376133,140638.1436


In [6]:
# List the column labels currently present in the frame
df.columns.tolist()

['RnD_Expenditure',
 'Admin_Costs',
 'Marketing_Expenditure',
 'Location',
 'Net_Profit',
 'Industry_Sector',
 'Adjusted_Profit_With_Noise',
 'Employee_Count',
 'Investment_Round',
 'Operational_Cost',
 'Adjusted_Profit']

In [7]:
# Remove the helper column, which is only populated on the synthetic rows
df = df.drop(columns=['Adjusted_Profit'])

In [9]:
# Expose the noisy adjusted profit under the shorter label "Profit"
df = df.rename(columns={"Adjusted_Profit_With_Noise": "Profit"})

In [10]:
# Display the adjusted startups DataFrame as the cell output
df

Unnamed: 0,RnD_Expenditure,Admin_Costs,Marketing_Expenditure,Location,Net_Profit,Industry_Sector,Profit,Employee_Count,Investment_Round,Operational_Cost
0,165349.200000,136897.800000,471784.100000,New York,192261.830000,Energy,227095.722346,289,Series A,608681.900000
1,162597.700000,151377.590000,443898.530000,California,191792.060000,Tech,224711.757208,217,Series B,595276.120000
2,153441.510000,101145.550000,407934.540000,Florida,191050.390000,Finance,222717.429984,407,Series C,509080.090000
3,144372.410000,118671.850000,383199.620000,New York,182901.990000,Finance,214017.365199,383,Series A,501871.470000
4,142107.340000,91391.770000,366168.420000,Florida,166187.940000,Finance,196476.965990,351,Series B,457560.190000
...,...,...,...,...,...,...,...,...,...,...
65,134093.502036,47327.344651,26681.272559,California,105423.930052,Tech,120378.804381,335,Seed,74008.617209
66,83460.960473,102975.139152,287058.802746,New York,-21313.506762,Healthcare,-14885.847169,430,Seed,390033.941898
67,111737.965689,125193.850785,73364.287453,New York,187372.314677,Finance,197551.192890,367,Seed,198558.138238
68,28077.387719,2743.241602,294652.768452,New York,62478.033370,Healthcare,64877.719983,258,Seed,297396.010053


In [11]:
# Save the adjusted startups data. index=True is the pandas default, spelled out here because
# it writes the row index as the first column of the file, with an empty header.
df.to_csv("startups_ajustado.csv", index=True)

In [12]:
# Load the crop recommendation dataset; this rebinds `df`, replacing the startups frame
df = pd.read_csv(filepath_or_buffer="Crop_recommendation.csv")

In [13]:
# Display the crop recommendation DataFrame as the cell output
df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,coffee
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,coffee
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,coffee
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,coffee


In [14]:
# Map the original column labels to descriptive Spanish names
column_names_map = {
    'N': 'Contenido_de_Nitrogeno',
    'P': 'Contenido_de_Fosforo',
    'K': 'Contenido_de_Potasio',
    'temperature': 'Temperatura_C',
    'humidity': 'Humedad_Relativa',
    'ph': 'Nivel_de_pH',
    'rainfall': 'Precipitacion_mm',
    'label': 'Tipo_de_Cultivo',
}

# Rebind df to the renamed frame instead of mutating it in place
df = df.rename(columns=column_names_map)

# Show the first rows of the dataset with the new column names
df.head()


Unnamed: 0,Contenido_de_Nitrogeno,Contenido_de_Fosforo,Contenido_de_Potasio,Temperatura_C,Humedad_Relativa,Nivel_de_pH,Precipitacion_mm,Tipo_de_Cultivo
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [15]:
# Distinct crop labels, in order of first appearance
df['Tipo_de_Cultivo'].unique()

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [16]:
# Save the renamed crop data. index=True is the pandas default, spelled out here because
# it writes the row index as the first column of the file, with an empty header.
df.to_csv("tipo_cultivo.csv", index=True)

In [None]:


from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Split the crop dataset into predictors and label: every column except the crop label
# ('Tipo_de_Cultivo') is used as a feature.
# NOTE(review): this cell previously used `features`, `target`, `x_train`, `x_test`, `y_train`
# and `y_test` without any cell defining them. The 80/20 split and random_state below are
# assumptions - adjust them if a different split was intended.
target = df['Tipo_de_Cultivo']
features = df.drop(columns=['Tipo_de_Cultivo'])
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=2
)

# Build the model
decision_tree_model = DecisionTreeClassifier(criterion="entropy", random_state=2, max_depth=5)

# Cross-validation. cross_val_score fits clones of the estimator, so `decision_tree_model`
# itself is still unfitted afterwards.
score = cross_val_score(decision_tree_model, features, target, cv=5)
print('Puntuación de validación cruzada:', score)

# Fit on the training split; without this, score() and predict() raise NotFittedError
decision_tree_model.fit(x_train, y_train)

# Training accuracy
dt_train_accuracy = decision_tree_model.score(x_train, y_train)
print("Precisión en entrenamiento =", dt_train_accuracy)

# Test accuracy
dt_test_accuracy = decision_tree_model.score(x_test, y_test)
print("Precisión en pruebas =", dt_test_accuracy)

# Confusion matrix on the held-out split
y_pred = decision_tree_model.predict(x_test)
y_true = y_test
cm_dt = confusion_matrix(y_true, y_pred)

# Plot the confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(cm_dt, annot=True, linewidths=0.5, fmt=".0f", cmap='viridis', ax=ax)
ax.set_xlabel("Predicho")
ax.set_ylabel("Real")
ax.set_title('Matriz de Confusión')
plt.show()