In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the raw airfoil-noise CSV into a DataFrame.
# NOTE: this is an absolute, machine-specific Windows path, so the cell only
# runs on a machine where this exact file exists.
df = pd.read_csv(
    r"C:\Users\labanya\OneDrive\Documents\Desktop\COLLEGE\PROJECT\DATASET\NASA_airfoil_noise_raw.csv"
)

# Preview the top of the frame (header line, then the rows) to see the columns.
print("First few rows of the dataset:", df.head(), sep="\n")

# Q1: number of rows in the dataset as loaded.
rows_original = df.shape[0]
print("Q1. Total rows in the dataset:", rows_original)

# Q2: number of rows left once duplicate rows are dropped.
df_no_duplicates = df.drop_duplicates()
rows_no_duplicates = df_no_duplicates.shape[0]
print("Q2. Rows after dropping duplicates:", rows_no_duplicates)

# Q3: number of rows left once rows containing null values are also dropped
# (applied on top of the de-duplicated frame, not the raw one).
df_cleaned = df_no_duplicates.dropna()
rows_cleaned = df_cleaned.shape[0]
print("Q3. Rows after dropping duplicates and nulls:", rows_cleaned)

# Normalise the header: trim leading/trailing whitespace from every column
# name so later lookups by exact name (e.g. 'SoundLevel') are not defeated by
# stray spaces in the CSV header.
stripped_names = df_cleaned.columns.str.strip()
df_cleaned.columns = stripped_names

# Show the resulting column index (label line first, then the Index repr).
print("Columns in the dataset after cleaning:", df_cleaned.columns, sep="\n")

# Q4: Rename 'SoundLevel' to 'PressureLevel' if it exists.
# The success message is printed only when the rename actually happened;
# previously it was printed unconditionally, so the output claimed a rename
# even for a dataset that has no 'SoundLevel' column.
if 'SoundLevel' in df_cleaned.columns:
    df_cleaned = df_cleaned.rename(columns={'SoundLevel': 'PressureLevel'})
    print("Q4. Renamed 'SoundLevel' to 'PressureLevel'")
else:
    print("Q4. Column 'SoundLevel' not found; nothing was renamed")

# Q5: write the cleaned frame to a Parquet file. The file name is relative,
# so it is created in the notebook's current working directory.
parquet_file = 'clean_airfoil_data.parquet'
df_cleaned.to_parquet(parquet_file)
print(f"Q5. Parquet file created: {parquet_file}")

# Q6: read the file back and count its rows.
df_parquet = pd.read_parquet(parquet_file)
rows_parquet = df_parquet.shape[0]
print("Q6. Rows in cleaned Parquet dataset:", rows_parquet)

# Q7-Q10: Machine Learning Pipeline Stages
# Fit an ordinary least-squares linear regression that predicts the target
# column from every other column of the cleaned frame, then report test-set
# metrics and the fitted parameters.

target_column = 'PressureLevel'

if target_column in df_cleaned.columns:
    # Target is the single named column; features are all remaining columns.
    y = df_cleaned[target_column]
    X = df_cleaned.drop(columns=[target_column])

    # Hold out 20% of the rows for evaluation; the fixed random_state makes
    # the split repeatable across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # fit() returns the estimator itself, so construction and training chain.
    model = LinearRegression().fit(X_train, y_train)

    # Predict on the held-out rows only.
    y_pred = model.predict(X_test)

    # Q11: Mean Squared Error on the test set
    mse = mean_squared_error(y_test, y_pred)
    print("Q11. Mean Squared Error:", mse)

    # Q12: Mean Absolute Error on the test set
    mae = mean_absolute_error(y_test, y_pred)
    print("Q12. Mean Absolute Error:", mae)

    # Q13: coefficient of determination on the test set
    r2 = r2_score(y_test, y_pred)
    print("Q13. R Squared:", r2)

    # Q14: fitted intercept term
    intercept = model.intercept_
    print("Q14. Intercept:", intercept)

    # Q15: the stage list is a hand-written description of this workflow;
    # it is not derived from any pipeline object.
    stages = ['Data Preprocessing', 'Model Training', 'Model Evaluation']
    print("Q15. Number of stages in the pipeline:", len(stages))

    # Q16-Q20: one fitted coefficient per feature, paired in column order.
    coefficients = model.coef_
    feature_names = X.columns

    print("\nQ16-Q20. Coefficients for each feature:")
    for feature, coef in zip(feature_names, coefficients):
        print(f"Coefficient for {feature}: {coef}")
else:
    # Without the target column there is nothing to train on; report and stop.
    print(f"Error: Column '{target_column}' not found in the dataset.")


First few rows of the dataset:
   Frequency  AngleOfAttack  ChordLength  FreeStreamVelocity  \
0      800.0            0.0       0.3048                71.3   
1     1000.0            0.0       0.3048                71.3   
2     1250.0            0.0       0.3048                71.3   
3     1600.0            0.0       0.3048                71.3   
4     2000.0            0.0       0.3048                71.3   

   SuctionSideDisplacement  SoundLevel  
0                 0.002663     126.201  
1                 0.002663     125.201  
2                 0.002663     125.951  
3                 0.002663     127.591  
4                 0.002663     127.461  
Q1. Total rows in the dataset: 1522
Q2. Rows after dropping duplicates: 1503
Q3. Rows after dropping duplicates and nulls: 1499
Columns in the dataset after cleaning:
Index(['Frequency', 'AngleOfAttack', 'ChordLength', 'FreeStreamVelocity',
       'SuctionSideDisplacement', 'SoundLevel'],
      dtype='object')
Q4. Renamed 'SoundLevel' t