In [8]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('../Datasets/processed/Soybeans.csv')

# Check if the 'Variety' column exists in the DataFrame
if 'Variety' in df.columns:
    # Count the occurrences of each unique value in the 'Variety' column
    variety_counts = df['Variety'].value_counts()

    # Print the counts
    print("Counts of different values in the 'Variety' column:")
    print(variety_counts)
else:
    print("'Variety' column not found in the CSV file.")


Counts of different values in the 'Variety' column:
GA PRICHARD    123
NE 3292 C      122
NE 3001        118
GA COOK        104
GA WOODRUFF    104
Name: Variety, dtype: int64


In [2]:
import pandas as pd

GRAIN_TYPE = "Wheat"  # Assuming you have defined GRAIN_TYPE somewhere
URL = "../Datasets/processed/" + GRAIN_TYPE + ".csv"

# Read in csv format
df = pd.read_csv(URL)

# Filter rows where the "Variety" column is "KANSAS"
if 'Variety' in df.columns and 'Phase' in df.columns and 'Attn' in df.columns:
    df_kansas = df[df['Variety'].str.upper() == 'KANSAS']  # This also makes the comparison case-insensitive

    # Calculate the "Phase/Attn" column
    # It's good practice to handle division by zero or invalid data
    df_kansas['Phase/Attn'] = df_kansas['Phase'] / df_kansas['Attn'].replace({0: None})

    # Save the filtered and modified DataFrame to a new CSV file
    df_kansas.to_csv('newWheatData.csv', index=False)
    print('Filtered data saved to newWheatData.csv')
else:
    missing_columns = []
    if 'Variety' not in df.columns:
        missing_columns.append('Variety')
    if 'Phase' not in df.columns:
        missing_columns.append('Phase')
    if 'Attn' not in df.columns:
        missing_columns.append('Attn')
    print(f"Missing column(s) in the DataFrame: {', '.join(missing_columns)}")


Filtered data saved to newWheatData.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [10]:
import pandas as pd

GRAIN_TYPE = "Corn"  # Assuming you have defined GRAIN_TYPE somewhere
URL = "../Datasets/processed/" + GRAIN_TYPE + ".csv"
# Read in csv format
df = pd.read_csv(URL)

# Check if the 'Variety' column exists in the DataFrame
if 'Variety' in df.columns:
    # Group the DataFrame by 'Variety' and then calculate min and max for each numeric column
    grouped = df.groupby('Variety').agg(['min', 'max'])

    # Print the result
    print("Minimum and Maximum values for each variety:")
    print(grouped)
else:
    print("'Variety' column not found in the DataFrame.")


Minimum and Maximum values for each variety:
              Unnamed: 0       Freq       d(cm)          M%        Density  \
                     min   max  min   max   min  max    min    max     min   
Variety                                                                      
HI BRED 31D58          0   112  5.0  12.0   3.3  6.5  16.61  31.58  0.4488   
HI BRED 33H82        113   222  5.0  16.0   2.0  8.5   9.47  33.55  0.5261   
HI BRED 33Y74        223   419  5.0  18.0   3.3  8.5   8.06  22.57  0.5741   
HI BRED 34M78        420   630  5.0  18.0   3.3  8.5   8.19  27.26  0.5213   
HI BRED 35F38        631   758  5.0  17.0   3.3  7.7  14.63  23.30  0.5452   
ILLINOIS             773   898  5.0  13.0   3.3  6.5  19.08  31.24  0.5141   
INDIANA              899  1032  5.0  15.0   3.3  7.7  13.91  23.61  0.4763   
KENTUCKY            1033  1147  5.0  16.0   3.3  7.7  11.95  32.71  0.5615   
MISSOURI            1148  1272  5.0  14.0   3.3  8.5  13.09  34.40  0.5154   
NEBRASKA           

In [4]:
import pandas as pd

# Load the data
GRAIN_TYPE = "Corn"  # Assuming you have defined GRAIN_TYPE somewhere
URL = "../Datasets/processed/" + GRAIN_TYPE + ".csv"
df = pd.read_csv(URL)

# Columns for which you want to calculate the average
columns_to_average = [
    'Freq', 'd(cm)', 'M%', 'Density', 'Attn', 'Phase', 
    'Phase_Corr', 'Permittivity_real', 'Permittivity_imaginary'
]

# Group by 'Variety', calculate the mean for the specified columns, and count for each group
aggregation = {col: 'mean' for col in columns_to_average}
aggregation['Variety'] = 'size'
grouped = df.groupby('Variety').agg(aggregation)

# Rename the 'Variety' column to 'Count'
grouped.rename(columns={'Variety': 'Count'}, inplace=True)

# Find the longest variety name for formatting
max_variety_length = max(len(str(variety)) for variety in grouped.index)

# Prepare the header
header_names = ["Variety", "Count"] + columns_to_average
header_str = f"{header_names[0]: <{max_variety_length}}  " + ' '.join([f"{name: >15}" for name in header_names[1:]])
print(header_str)

# Print the results
for variety, row in grouped.iterrows():
    # Format variety names to have equal distance from the first number
    variety_str = f"{variety: <{max_variety_length}}  "
    # Get count and average values, ensure correct formatting
    count_str = f"{row['Count']: >15}"
    avg_values_str = ' '.join([f"{value: >15.3f}" for value in row[columns_to_average]])
    print(variety_str + count_str + avg_values_str)

Variety                      Count            Freq           d(cm)              M%         Density            Attn           Phase      Phase_Corr Permittivity_real Permittivity_imaginary
KANSAS                       178.0         11.253           7.080          15.856           0.767          18.213          -4.896        -619.727           2.778           0.461
NEBRASKA OVERLAND            166.0         10.614           7.156          16.402           0.771          18.767          -1.886        -622.127           2.882           0.507
NEBRASKA SETTLER             164.0         10.409           7.186          16.400           0.820          19.278          -5.802        -651.168           3.034           0.542
OKLAHOMA                     178.0         11.674           7.102          15.353           0.813          16.930          -2.117        -663.465           2.816           0.425
SOUTH DAKOTA                 120.0          9.700           6.857          17.344           0.819   

now we will create a csv file with the column type, that will have the average moisture content and act as the category. And the column Phase / Attn which will have well exactly what the name says


In [9]:
import pandas as pd

# Load the data
GRAIN_TYPE = "Wheat"  # Assuming you have defined GRAIN_TYPE somewhere
URL = "../Datasets/processed/" + GRAIN_TYPE + ".csv"
df = pd.read_csv(URL)
# Calculate the mean M% for each variety
mean_m_per_variety = df.groupby('Variety')['M%'].mean().reset_index()
mean_m_per_variety.rename(columns={'M%': 'Type'}, inplace=True)

# Merge the mean M% back into the original DataFrame
df_with_type = pd.merge(df, mean_m_per_variety, how='left', on='Variety')
# Add a new column 'Phase/Attn' representing phase divided by attn
df_with_type['Phase/Attn'] = df_with_type['Phase'] / df_with_type['Attn']

# Handle potential division by zero or NaN values, if necessary
df_with_type['Phase/Attn'] = df_with_type['Phase/Attn'].replace([float('inf'), -float('inf')], pd.NA)
# Export the updated DataFrame to a new CSV file
df_with_type.to_csv('../Datasets/processed/' + GRAIN_TYPE + 'Added_Type.csv', index=False)
