# **1. Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from google.colab import files

# **2. Uploading and Loading Data**

In [None]:
# Upload Excel file (opens the Colab file picker; the result maps each
# uploaded filename to its contents)
uploaded = files.upload()

# Cancelling the picker yields an empty mapping. Fail with a clear message
# instead of the opaque IndexError that indexing an empty list would raise.
if not uploaded:
    raise ValueError('No file was uploaded; please select an Excel file.')

# Get filename (only the first uploaded file is used)
data_file = next(iter(uploaded))

# Load the Excel file into a DataFrame
data = pd.read_excel(data_file)

# **3. Function For Data Cleaning**
This section covers data cleaning, data-type conversion, and cleaning of specific columns.

In [None]:
# Set future behavior option
pd.set_option('future.no_silent_downcasting', True)

# Function to clean intensity values
def clean_intensity(value):
    """Convert one raw 'PL intensity' cell into a number or NaN.

    Non-string values are returned unchanged. For strings, surrounding
    whitespace is ignored and:

    * ``'not in range'`` (any letter case) becomes NaN;
    * an upper-bound entry such as ``'<3200'`` becomes ``3200.0``;
    * plain numeric text such as ``'4500'`` becomes ``4500.0``;
    * anything else that cannot be parsed becomes NaN.

    Args:
        value: The raw cell value (string, number, NaN, ...).

    Returns:
        The unchanged value for non-strings, otherwise a float or NaN.
    """
    if not isinstance(value, str):
        return value

    text = value.strip()
    if text.lower() == 'not in range':
        return np.nan

    # Drop the '<' marker (if any) and keep the stated bound as the value.
    # Text that still is not a number is treated as missing, so a string is
    # never returned to the numeric pipeline.
    try:
        return float(text.replace('<', '').strip())
    except ValueError:
        return np.nan

# Replace non-numeric entries with NaN
# NOTE: the literal '<3200' is listed here, so those rows are treated as
# missing and dropped below; other '<N' entries are parsed by clean_intensity.
data.replace(['-', '#VALUE!', '<3200', 'not in range'], np.nan, inplace=True)

# Drop rows with NaN values (in any column)
data.dropna(inplace=True)

# Clean the 'PL intensity' column BEFORE the numeric conversion:
# pd.to_numeric(errors='coerce') turns every remaining string into NaN, so
# running clean_intensity afterwards would never see a '<N' entry.
data['PL intensity'] = data['PL intensity'].apply(clean_intensity)

# Convert relevant columns to numeric (unparseable entries become NaN)
data['PL intensity'] = pd.to_numeric(data['PL intensity'], errors='coerce')
data['Plwave'] = pd.to_numeric(data['Plwave'], errors='coerce')

# Replace NaN with the median of 'PL intensity'
median_intensity = data['PL intensity'].median()
data['PL intensity'] = data['PL intensity'].fillna(median_intensity)

# Drop any remaining NaN values after conversion
data.dropna(inplace=True)


# **Feature Selection and Splitting Data**

In [None]:

# The two photoluminescence measurements are the inputs; bandgap is the target
feature_columns = ['PL intensity', 'Plwave']
target_column = 'Bandgap'
X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

## **1. PL Intensity and Bandgap Relationship:**
The photoluminescence (PL) intensity provides insight into the material's quality and purity. Higher PL intensity generally suggests that the material has fewer impurities and defects. This is because high PL intensity indicates minimal non-radiative recombination centers, which are often caused by defects or impurities. In a high-quality perovskite material, the PL intensity will be strong and stable, meaning the material can maintain its bandgap without significant energy loss.

*Dependency*: By maximizing PL intensity, you can infer a stable and efficient bandgap, which is especially important for applications like solar cells. Higher intensity often correlates with a cleaner synthesis process and fewer defects in the perovskite ink, meaning the material can more efficiently convert light into electricity without performance degradation.



## **2. PL Wavelength and Bandgap Relationship:**

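PL wavelength (or peak position) is often directly associated with the bandgap energy of the material, because the emitted photon energy is approximately E (eV) ≈ 1240 / λ (nm); for example, 800 nm corresponds to about 1.55 eV. Shorter wavelengths correspond to higher bandgap energies, while longer wavelengths indicate a smaller bandgap.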

*Dependency*: For applications requiring a specific bandgap (e.g., ~1.5 eV for solar cells), you can target formulations that produce PL wavelengths correlating to that energy level. This lets you fine-tune the bandgap by altering the chemical composition (e.g., halide or cation variations) to yield the optimal wavelength and corresponding bandgap.

# **Model Training and Making Predictions**

In [None]:
# Create the linear regression and fit it on the training split in one step
# (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict the bandgap for the held-out test rows
y_pred = model.predict(X_test)

# **Model Evaluation and Model Coefficients**

In [None]:
# Score the test-set predictions
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
r2 = r2_score(y_test, y_pred)  # R² Score

# Report both metrics, one per line
print(f'Mean Absolute Error (MAE): {mae}', f'R² Score: {r2}', sep='\n')

# Tabulate the fitted coefficient of each input feature
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print('\nModel Coefficients:', coefficients, sep='\n')

A sample with high PL intensity typically has fewer non-radiative recombination centers, meaning it can maintain an ideal bandgap without significant losses.

PL intensity acts as a quality check, confirming that the ink has minimal impurities, which indirectly supports the stability and efficiency of the desired bandgap.

## **A strong, positive relationship between PL intensity and bandgap indicates a stable and efficient bandgap.**

PL Wavelength is typically inversely related to the bandgap energy. Shorter wavelengths indicate higher energy (and thus a higher bandgap), while longer wavelengths suggest a lower bandgap.

PL wavelength mainly helps fine-tune the actual energy level of the bandgap, essential for hitting specific energy targets like ~1.5 eV.

PL Wavelength values help identify specific features of the sample's absorption spectrum, such as peak absorption points or cutoff wavelengths.


## **As PL wavelength increases, the bandgap decreases.**


# **Function to Display Intensity, Wavelength, and Bandgap Values**

In [None]:
import pandas as pd

# Function to display intensity, wavelength, and bandgap values side by side with counts
def display_values(data):
    """Print the PL intensity, PL wavelength and bandgap columns with a summary.

    Args:
        data: DataFrame containing the columns 'PL intensity', 'Plwave'
            and 'Bandgap'.

    Prints the three columns side by side (without the index), followed by
    the number of entries in each column and the mean bandgap. Returns None.
    """
    # Source column -> heading used in the printed table
    column_labels = {
        'PL intensity': 'PL Intensity',
        'Plwave': 'PL Wavelength',
        'Bandgap': 'Bandgap',
    }
    combined_data = pd.DataFrame(
        {label: data[column] for column, label in column_labels.items()}
    )

    # Number of entries in each column (len() counts every row, NaN included)
    count_intensity = len(data['PL intensity'])
    count_wavelength = len(data['Plwave'])
    count_bandgap = len(data['Bandgap'])

    print("Combined PL Intensity, Wavelength, and Bandgap Values:")

    # Wide, fixed column spacing keeps the values aligned; index is omitted
    print(combined_data.to_string(index=False, col_space=20))

    # Calculate average bandgap
    average_bandgap = data['Bandgap'].mean()

    # Print counts (the bandgap count was previously computed but never shown)
    print(f"\nCount of PL Intensity Values: {count_intensity}")
    print(f"Count of PL Wavelength Values: {count_wavelength}")
    print(f"Count of Bandgap Values: {count_bandgap}")
    print(f"Average Bandgap Value: {average_bandgap:.3f}")  # 3 decimal places


# Call the function to display values for the module-level `data` DataFrame
display_values(data)


# **Visualization Function**

# *i) Scatter Plot of Actual vs. Predicted Values*

In [None]:
import matplotlib.pyplot as plt

# Predict the bandgap for every row of X (not just the test split)
y_all_pred = model.predict(X)

# Pair each actual bandgap with its prediction
combined_df = pd.DataFrame({'Actual': y, 'Predicted': y_all_pred})

# Scatter of actual vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(combined_df['Actual'], combined_df['Predicted'], alpha=0.6, edgecolors='k', label='All Data Points')

# Identity line: points on it are predicted exactly
bandgap_range = [y.min(), y.max()]
ax.plot(bandgap_range, bandgap_range, 'r--', lw=2, label='Perfect Prediction Line')

ax.set_xlabel('Actual Bandgap')
ax.set_ylabel('Predicted Bandgap')
ax.set_title('Actual vs Predicted Bandgap (All Data Points)')
ax.legend()
ax.grid()
plt.show()


The bandgap is a fundamental property that determines how the material conducts electricity and absorbs light. It indicates the energy difference between the valence and conduction bands.

## **Predicted vs. Actual Bandgap Plot:** The scatter plot comparing predicted and actual bandgap values shows how accurately the model can capture the relationships within the data. If points are close to the identity line (where predicted values equal actual values), it implies the model’s reliability.

## **Inference:** Consistency along the identity line shows our model is a useful predictor for bandgap values based on PL intensity and wavelength. If there is significant deviation, further data refinement or feature adjustments might be necessary for more precise predictions.

# *ii) Residual Plot*

In [None]:
# Residual = actual bandgap minus the model's prediction for the same row
residuals = y - y_all_pred

# Residuals against predictions; the dashed line marks zero error
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_all_pred, residuals, alpha=0.6, edgecolors='k')
ax.axhline(0, color='red', linestyle='--', label='Zero Error Line')
ax.set(
    xlabel='Predicted Bandgap',
    ylabel='Residuals',
    title='Residuals vs Predicted Bandgap',
)
ax.legend()
ax.grid()
plt.show()


# *iii) Coefficient plot*

In [None]:
# Tabulate the fitted coefficient of each input feature
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})

# Horizontal bar chart: one bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(coefficients['Feature'], coefficients['Coefficient'], color='skyblue')
ax.set_xlabel('Coefficient Value')
ax.set_title('Feature Coefficients from Linear Regression')
ax.grid()
plt.show()



# *iv)  Correlation heatmap*

In [None]:
import seaborn as sns

# Pairwise correlation of the columns in `data`, drawn as an annotated heatmap
plt.figure(figsize=(8, 6))
correlation_matrix = data.corr()
heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'fmt': '.2f'}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap')
plt.show()


# *v) Density Plot Of Actual and Predicted Values*

In [None]:
import seaborn as sns

plt.figure(figsize=(10, 6))

# One kernel density estimate per series: actual first, then predicted
density_series = (
    (y, 'Actual Bandgap', 'blue'),
    (y_all_pred, 'Predicted Bandgap', 'orange'),
)
for series_values, series_label, series_color in density_series:
    sns.kdeplot(series_values, label=series_label, color=series_color, fill=True, alpha=0.5)

plt.xlabel('Bandgap')
plt.ylabel('Density')
plt.title('Distribution of Actual vs Predicted Bandgap (Density Plot)')
plt.legend()
plt.grid()
plt.show()


# *vi) Function to Predict Bandgap for New Ink Parameters*

In [None]:
# Function to predict bandgap for new ink parameters
def predict_bandgap(pl_intensity, pl_wave):
    """Predict the bandgap for one PL intensity / PL wavelength pair.

    Uses the module-level ``model``. Prints the inputs and the prediction,
    then returns the predicted value.

    Args:
        pl_intensity: PL intensity of the new sample.
        pl_wave: PL wavelength of the new sample.

    Returns:
        The first (and only) element of the model's prediction.
    """
    # Single-row frame using the same column names as the training features
    features = pd.DataFrame({'PL intensity': [pl_intensity], 'Plwave': [pl_wave]})
    bandgap = model.predict(features)[0]

    # Print input parameters and predicted bandgap
    print(f"Input Parameters:\nPL Intensity: {pl_intensity}\nPL Wavelength: {pl_wave}")
    print(f"Predicted Bandgap: {bandgap}")

    return bandgap

# Example inputs for a single prediction
optimal_pl_intensity = 5000  # Example value
optimal_pl_wave = 800.0  # Example value
predicted_bandgap = predict_bandgap(optimal_pl_intensity, optimal_pl_wave)

# Predictions for every row of X
y_all_pred = model.predict(X)

# Actual vs predicted scatter for the existing data
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y, y_all_pred, alpha=0.6, edgecolors='k', label='All Data Points')

# The new input has no measured bandgap, so its predicted value is used for
# both coordinates (the marker therefore lies on the identity line)
ax.scatter(predicted_bandgap, predicted_bandgap, color='green', s=100, edgecolor='black', label='Predicted Bandgap (New Input)')

# Perfect prediction line
identity_ends = [y.min(), y.max()]
ax.plot(identity_ends, identity_ends, 'r--', lw=2, label='Perfect Prediction Line')

ax.set(
    xlabel='Actual Bandgap',
    ylabel='Predicted Bandgap',
    title='Actual vs Predicted Bandgap (Including New Prediction)',
)
ax.legend()
ax.grid()
plt.show()

In [None]:
import pickle
# Serialise the fitted model and write the bytes to 'trained_model.pkl'
with open('trained_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(model))
