In [None]:
import pandas as pd
# Read the water-quality measurements from the CSV file (path is relative to
# the notebook's working directory).
df = pd.read_csv('Water Quality Testing.csv')

# Print the first rows, then the summary statistics, as plain text.
for preview in (df.head(), df.describe()):
    print(preview)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for this and subsequent figures.
sns.set(style='whitegrid')

# Histogram of the pH column (30 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6), facecolor='white')
sns.histplot(df['pH'], kde=True, color='blue', bins=30, ax=ax)
ax.set_title('Distribution of pH Values')
ax.set_xlabel('pH')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter of dissolved oxygen against pH; semi-transparent markers keep
# overlapping points distinguishable.
fig, ax = plt.subplots(figsize=(10, 6), facecolor='white')
ax.scatter(df['pH'], df['Dissolved Oxygen (mg/L)'], color='green', alpha=0.5)
ax.set_title('Scatter Plot of pH vs. Dissolved Oxygen')
ax.set_xlabel('pH')
ax.set_ylabel('Dissolved Oxygen (mg/L)')
ax.grid(True)
plt.show()

In [None]:
import numpy as np

# Scatter plot of pH vs. dissolved oxygen with a least-squares trend line.
# Only rows where both values are present are used, so the fit cannot be
# poisoned by missing values.
ph_do = df[['pH', 'Dissolved Oxygen (mg/L)']].dropna()

plt.figure(figsize=(10, 6), facecolor='white')
plt.scatter(ph_do['pH'], ph_do['Dissolved Oxygen (mg/L)'], color='green', alpha=0.5)

# Degree-1 polynomial fit: z holds (slope, intercept), p evaluates the line.
z = np.polyfit(ph_do['pH'], ph_do['Dissolved Oxygen (mg/L)'], 1)
p = np.poly1d(z)

# A straight line needs only its two end points. Drawing it through every
# sample in file order makes the path double back on itself, and the
# overlapping dashes then merge into what looks like a solid line.
ph_range = np.array([ph_do['pH'].min(), ph_do['pH'].max()])
plt.plot(ph_range, p(ph_range), "r--")

plt.title('Scatter Plot of pH vs. Dissolved Oxygen with Trend Line')
plt.xlabel('pH')
plt.ylabel('Dissolved Oxygen (mg/L)')
plt.grid(True)
plt.show()

In [None]:
# Correlation between the pH and dissolved-oxygen columns, using the default
# method of pandas' Series.corr.
ph_values = df['pH']
dissolved_oxygen = df['Dissolved Oxygen (mg/L)']
correlation = ph_values.corr(dissolved_oxygen)
print('Correlation coefficient between pH and Dissolved Oxygen:', correlation)

In [None]:
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import statsmodels.api as sma
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# Load the CSV again (same path as the earlier `df` cell) under a descriptive
# name; all modelling cells below work on this frame.
water_quality_data = pd.read_csv('Water Quality Testing.csv')
# Bare last expression: the notebook renders the frame as the cell output.
water_quality_data

In [None]:
# Summary statistics of the frame's columns.
water_quality_data.describe()

In [None]:
# Column names, dtypes and non-null counts (printed by pandas itself).
water_quality_data.info()

In [None]:
# Linear regression of conductivity on the four other measurements.
predictor_columns = ['pH', 'Temperature (°C)', 'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)']
target_column = 'Conductivity (µS/cm)'

reg = linear_model.LinearRegression()
# Last expression: fit() hands back the estimator, which the notebook displays.
reg.fit(water_quality_data[predictor_columns], water_quality_data[target_column])

In [None]:
# Intercept term of the fitted linear model `reg`.
reg.intercept_

In [None]:
# Fitted coefficients of `reg` for the predictor columns passed to reg.fit.
reg.coef_

In [None]:
reg_pred = reg.intercept_ + reg.coef_
reg_pred

In [None]:
plt.plot(reg_pred, marker='x')
plt.show()

In [18]:
a = water_quality_data['pH']
b = water_quality_data['Temperature (°C)']

In [19]:
a = np.array(a)
b = np.array(b)

In [20]:
a = sma.add_constant(a)

In [21]:
model = sma.OLS(b, a).fit()

In [None]:
print(model.summary())

In [None]:
# Pairwise scatter plots across the frame's columns, 3.5 inches per facet.
sns.set_style('whitegrid')
sns.pairplot(data=water_quality_data, kind='scatter', height=3.5)

In [None]:
# Scatter of conductivity against dissolved oxygen. Axis labels and a title
# are set so the figure can be read on its own, like the earlier scatter plots.
x_column = 'Dissolved Oxygen (mg/L)'
y_column = 'Conductivity (µS/cm)'
plt.scatter(x=water_quality_data[x_column], y=water_quality_data[y_column], marker='x')
plt.title('Scatter Plot of Dissolved Oxygen vs. Conductivity')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.show()

In [None]:
# Pair plot of the four measurements, coloured by conductivity.
# Conductivity is continuous: used directly as `hue`, every distinct value
# becomes its own level, so the per-level diagonal densities degenerate.
# Colouring by quartile band keeps the plot readable.
sns.set_style('whitegrid')
measurement_columns = ['pH', 'Temperature (°C)', 'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)']
conductivity_band = pd.qcut(
    water_quality_data['Conductivity (µS/cm)'], q=4, duplicates='drop'
).cat.rename_categories(str)  # string labels, quartile order preserved
sns.pairplot(
    water_quality_data[measurement_columns].assign(**{'Conductivity band (µS/cm)': conductivity_band}),
    hue='Conductivity band (µS/cm)',
)

In [None]:
sns.relplot(water_quality_data, x='Turbidity (NTU)', y='Dissolved Oxygen (mg/L)')

In [None]:
sns.regplot(water_quality_data, x='Turbidity (NTU)', y='Dissolved Oxygen (mg/L)')

In [None]:
sns.set_style("darkgrid")
sns.histplot(water_quality_data, x='pH')

In [None]:
sns.lmplot(water_quality_data, x='Temperature (°C)', y='Conductivity (µS/cm)', line_kws={'color':'black'})

In [None]:
sns.lmplot(water_quality_data, x='pH', y='Turbidity (NTU)', scatter_kws={'color' : 'grey'}, line_kws={'color' : 'black'})

In [31]:
a = water_quality_data['Turbidity (NTU)']
b = water_quality_data['Dissolved Oxygen (mg/L)']

In [32]:
features = ['Turbidity (NTU)', 'Dissolved Oxygen (mg/L)']
a_train = water_quality_data[features]
a_test = water_quality_data[features]
b_train = water_quality_data['Conductivity (µS/cm)']

In [33]:
regre = linear_model.LinearRegression()
regre.fit(a_train, b_train)
prediction = regre.predict(a_test)

In [None]:
# Display the conductivity values returned by regre.predict.
prediction

In [None]:
# Shape of the prediction array; a_test was built from the full table, so the
# length should equal the table's row count.
print(prediction.shape)

In [None]:
submission = water_quality_data[['Sample ID']]
submission

In [None]:
submission['Conductivity (µS/cm)'] = prediction

In [None]:
submission

In [None]:
submission = water_quality_data[['Sample ID', 'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)']]
submission

In [None]:
submission['Conductivity (µS/cm)'] = prediction

In [None]:
submission

In [None]:
# Turbidity against the conductivity column of `submission` (which holds the
# model's predictions), with a fitted regression line.
sns.lmplot(data=submission, x='Turbidity (NTU)', y='Conductivity (µS/cm)')

In [43]:
p = submission['Turbidity (NTU)']
q = submission['Conductivity (µS/cm)']

In [44]:
p = np.array(p)
q = np.array(q)

In [45]:
p = sma.add_constant(p)

In [46]:
model = sma.OLS(q, p).fit()

In [None]:
print(model.summary())

In [48]:
columns = ['pH', 'Temperature (°C)', 'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)']

In [49]:
a_train = water_quality_data[columns]
a_test = water_quality_data[columns]
b_train = water_quality_data['Conductivity (µS/cm)']

In [50]:
Regression = linear_model.LinearRegression()
Regression.fit(a_train, b_train)
predictions = Regression.predict(a_test)

In [None]:
predictions

In [None]:
# Store the four-feature predictions as a new column on the measurement table
# (this modifies water_quality_data in place), then display the table.
prediction_column = 'Prediction of Conductivity (µS/cm)'
water_quality_data[prediction_column] = predictions
water_quality_data