In [10]:
import pandas as pd


df = pd.DataFrame({'A': ['aa ', 'bb  12', 'cc 3', 'dd  456', 'ee 789  ']})

def remove_trailing_space_or_digit(text):
  """
  Removes trailing spaces or digits from a string.

  Args:
    text: The string to modify.

  Returns:
    The modified string with trailing spaces or digits removed.
  """
  while text and (text[-1].isspace() or text[-1].isdigit()):
    text = text[:-1]
  return text

# Apply the lambda function to the 'A' column
df['A'] = df['A'].apply(remove_trailing_space_or_digit)

print(df)


    A
0  aa
1  bb
2  cc
3  dd
4  ee


Here are several approaches you can consider for modeling your data in Python with 40 quantitative inputs and 2 quantitative outputs:

1. Supervised Learning with Regularization:

Linear Regression: A classic approach for modeling linear relationships between features and outputs. Use regularization techniques like Ridge or Lasso to handle large numbers of features and potential collinearity.
Support Vector Regression (SVR): Effective for non-linear relationships and robust to outliers.
Decision Tree Regression: Can capture complex relationships without feature engineering. Consider Random Forest Regression for improved stability.
2. Neural Networks:

Multi-Layer Perceptron (MLP): A deep learning model adaptable to diverse relationships. Fine-tune hyperparameters like the number of layers and neurons.
Long Short-Term Memory (LSTM) Networks: Effective for analyzing sequential data if your inputs have inherent order or time dependence.
3. Feature Engineering and Selection:

Dimensionality Reduction: Use techniques like Principal Component Analysis (PCA) to reduce the number of features, especially if computational resources are limited.
Feature Selection: Identify the most relevant features using techniques like correlation analysis or information gain.
4. Evaluation and Improvement:

Metrics: Choose appropriate metrics like Mean Squared Error (MSE) or R-squared to evaluate model performance.
Cross-validation: Split your data and validate your model's generalizability.
Hyperparameter Tuning: Experiment with different hyperparameters for each model to improve performance.
Additional points:

Data Normalization or Scaling: Normalize or scale your input features to ensure they have similar scales, especially if using distance-based algorithms like SVR.
Missing Data Handling: Check for missing values and consider imputation techniques.
Software Recommendations: Popular libraries include scikit-learn, TensorFlow, PyTorch, and Keras. Choose the one that suits your experience and project requirements.

I have 4 columns of quantitative inputs and 2 columns of quantitative outputs in a pandas dataframe. I need a way to model the data in python.
Can you give me some code (with sample data) for Long Short-Term Memory (LSTM) Networks?

In [11]:
#Polynomial regression with PCA

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]

# Use PCA to reduce the dimensionality of the dataset
pca = PCA(n_components=14)
inputs = pca.fit_transform(X)

# Use polynomial modeling to fit the data
poly = PolynomialFeatures(degree=4)
inputs_poly = poly.fit_transform(inputs)
model = LinearRegression().fit(inputs_poly, y)

# Generate new data and corresponding outputs

new_inputs = pca.transform(X)
new_inputs_poly = poly.transform(new_inputs)
new_outputs = model.predict(new_inputs_poly)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(new_outputs, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

770 219


In [12]:
# Linear Regression

import pandas as pd
from sklearn.linear_model import MultiTaskElasticNet

# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]

# Create a MultiTaskElasticNet model with regularization
model = MultiTaskElasticNet(alpha=0.1)  # Adjust alpha for regularization strength

# Fit the model to the training data
model.fit(X, y)

# Make predictions on the testing data
y_pred = model.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

701 288


In [13]:
# Support Vector Regression

import pandas as pd
from sklearn.svm import SVR

# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
winner = df[['AFScore']]
looser = df[['AUScore']]

# Create SVR models for each output
model1 = SVR()
model2 = SVR()

# Fit models to the training data
model1.fit(X, winner)
model2.fit(X, looser)

# Make predictions on the testing data
y1_pred = model1.predict(X)
y2_pred = model2.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,winner,looser, pd.DataFrame({'output1':y1_pred,'output2':y2_pred})], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


712 277


In [14]:
# Decision Tree Regression

import pandas as pd
from sklearn.tree import DecisionTreeRegressor


# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]


# Create and tune the model
model = DecisionTreeRegressor()

# Fit the model to the training data
model.fit(X, y)

# Make predictions on the testing data
y_pred = model.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

770 218


In [15]:
# MultiLayer Perception

import pandas as pd
from sklearn.neural_network import MLPRegressor

# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]

mlp = MLPRegressor(
    hidden_layer_sizes=(10, 5),  # Two hidden layers with 10 and 5 neurons
    activation='relu',  # ReLU activation function
    solver='adam',  # Adam optimizer
    max_iter=500  # Maximum number of iterations
)

# Fit the model to the training data
mlp.fit(X, y)

# Make predictions on the testing data
y_pred = mlp.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

676 313




1. Deep Neural Networks (DNNs):

Pros: Powerful and flexible, can capture complex non-linear relationships, suitable for large datasets.
Cons: Can be computationally expensive, might require data pre-processing and hyperparameter tuning, black-box nature makes interpretation challenging.

2. Gradient Boosting Machines (GBMs):

Pros: Handles complex interactions between features, robust to outliers, good interpretability through feature importance scores.
Cons: May not perform as well as DNNs on very complex datasets, hyperparameter tuning is important.

3. Transformer-based models:

Pros: State-of-the-art for sequential data and natural language processing, potentially good for capturing long-range dependencies between features.
Cons: Relatively new technology, computationally expensive, requires large datasets to train effectively.

4. AutoML tools:

Pros: Automate model selection and hyperparameter tuning, convenient for non-experts.
Cons: Limited interpretability, black-box nature makes understanding the model less transparent.

5. Multi-Output Learning Techniques:

Pros: Specifically designed for handling multiple outputs, various approaches available like multi-task learning, ensemble methods with shared representations.
Cons: Complexity depends on the chosen technique, understanding interactions between outputs might be challenging.

I have 4 columns of quantitative inputs and 2 columns of quantitative outputs in a pandas dataframe. I need a way to model the data in python.
Can you give me some code (with sample data) for Long Short-Term Memory (LSTM) Networks?

In [16]:
# Transformer Based Models
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]

# Define transformers
numeric_transformer = StandardScaler()


# Specify which columns are numeric and categorical
numeric_features = ['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']


# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Fit and transform the data
X_transformed = preprocessor.fit_transform(X)

# Convert the transformed data back to a pandas dataframe
X = pd.DataFrame(X_transformed, columns=numeric_features)

# Create and tune the model
model = DecisionTreeRegressor()

# Fit the model to the training data
model.fit(X, y)

# Make predictions on the testing data
y_pred = model.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)


770 218


In [17]:
# Gradient Boosting
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor


# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y1 = df['AFScore']
y2 = df['AUScore']

# Create and fit GBM models
model1 = GradientBoostingRegressor(n_estimators=100, learning_rate=1.0, max_depth=1)
model1.fit(X, y1)

model2 = GradientBoostingRegressor(n_estimators=100, learning_rate=1.0, max_depth=1)
model2.fit(X, y2)

# Make predictions
y_pred1 = model1.predict(X)
y_pred2 = model2.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y1,y2, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)


770 218


In [18]:
# MultiTaskElasticNet
import pandas as pd
from sklearn.linear_model import MultiTaskElasticNet



# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]

# Create and train MultiTaskElasticNet model
model = MultiTaskElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X, y)

# Make predictions
y_pred = model.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

701 288


In [19]:
# RandonForestRegressor
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]

# Create and train Multi-Output Random Forest model
model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)

# Make predictions
y_pred = model.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

748 240


In [20]:
# MultiOutputRegressor
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor

# Sample data with 4 quantitative inputs and 2 quantitative outputs
data = pd.read_csv('step07_FUStats.csv')

# Create a Pandas DataFrame
df = pd.DataFrame(data)

df = df.dropna()
X = df[['AFSeed', 'AUSeed', 'AdjEM_x', 
       'AdjO_x', 'AdjD_x', 'AdjT_x', 'SOSEM_x', 'SOSO_x', 'SOSD_x', 'BPI(O)_x',
       'BPI(D)_x', 'W_x', 'L_x', 'Pts_x', 'Opp_x', 'MOV_x', 'SOS_x', 'OSRS_x',
       'DSRS_x', 'SRS_x', 'PASE_x', 'AdjEM_y', 'AdjO_y', 'AdjD_y',
       'AdjT_y', 'SOSEM_y', 'SOSO_y', 'SOSD_y', 'BPI(O)_y', 'BPI(D)_y', 'W_y',
       'L_y', 'Pts_y', 'Opp_y', 'MOV_y', 'SOS_y', 'OSRS_y', 'DSRS_y', 'SRS_y', 'PASE_y']]
y = df[['AFScore', 'AUScore']]

# Define the model
model = MultiOutputRegressor(LinearRegression())
model.fit(X, y)

# Make predictions
y_pred = model.predict(X)

# Print the new data and corresponding outputs

check = pd.concat([X,y, pd.DataFrame(y_pred, columns=['output1', 'output2'])], axis=1)
check['real'] = check['AFScore'] - check['AUScore']
check['predicted'] = check['output1'] - check['output2']
check = check.dropna()

correct = (check['real'] * check['predicted'] > 0).sum()
incorrect = (check['real'] * check['predicted'] < 0).sum()

print(correct,incorrect)

707 282
