In [76]:
import pandas as pd

# Load the dataset into a pandas dataframe
df = pd.read_csv('CASE1201.ASC.txt', sep='\t')

# Display the first few rows of the dataframe to check the data
df.head()

Unnamed: 0,state sat takers income years public expend rank
0,Iowa 1088 3 326 16.79 87...
1,SouthDakota 1075 2 264 16.07 86...
2,NorthDakota 1068 3 317 16.57 88...
3,Kansas 1045 5 338 16.30 83...
4,Nebraska 1045 5 293 17.25 83...


In [77]:
# Display the number of rows and columns in the dataframe
print("Number of rows and columns:", df.shape)

# Display the column names (features) in the dataframe
print("Column names (features):", df.columns)

# Display the data types of each column in the dataframe
print("Data types of each column:\n", df.dtypes)

# Display basic statistics of the numerical columns in the dataframe
print("Basic statistics of numerical columns:\n", df.describe())

# Display information about the dataframe, including data types and non-null values
print("Information about the dataframe:\n", df.info())

# Check for missing values in the dataframe
print("Missing values:\n", df.isnull().sum())


Number of rows and columns: (50, 1)
Column names (features): Index(['state            sat takers    income years public expend rank'], dtype='object')
Data types of each column:
 state            sat takers    income years public expend rank    object
dtype: object
Basic statistics of numerical columns:
        state            sat takers    income years public expend rank
count                                                  50            
unique                                                 50            
top     Iowa            1088   3        326  16.79  87...            
freq                                                    1            
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 1 columns):
 #   Column                                                          Non-Null Count  Dtype 
---  ------                                                          --------------  ----- 
 0   state            sat takers    income years public ex

In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas dataframe
df = pd.read_csv('CASE1201.ASC.txt', delim_whitespace=True)

df.head()

Unnamed: 0,state,sat,takers,income,years,public,expend,rank
0,Iowa,1088,3,326,16.79,87.8,25.6,89.7
1,SouthDakota,1075,2,264,16.07,86.2,19.95,90.6
2,NorthDakota,1068,3,317,16.57,88.3,20.62,89.8
3,Kansas,1045,5,338,16.3,83.9,27.14,86.3
4,Nebraska,1045,5,293,17.25,83.6,21.05,88.5


In [79]:
# Handle missing values (if any)
# For example, you can fill missing values with mean or median using fillna() method
#df = df.drop('state')

df = df.drop('state', axis=1)
df.head()

Unnamed: 0,sat,takers,income,years,public,expend,rank
0,1088,3,326,16.79,87.8,25.6,89.7
1,1075,2,264,16.07,86.2,19.95,90.6
2,1068,3,317,16.57,88.3,20.62,89.8
3,1045,5,338,16.3,83.9,27.14,86.3
4,1045,5,293,17.25,83.6,21.05,88.5


In [80]:
df.fillna(df.mean(), inplace=True)


In [81]:
# Convert categorical variables to numerical representation (if needed)
# For example, you can use label encoding or one-hot encoding to convert categorical variables
# to numerical representation using pandas or scikit-learn

# Split the dataset into features (X) and target (y) variables
X = df.drop('sat', axis=1) # Features are all columns except for 'SAT'
y = df['sat'] # Target variable is 'SAT'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the first few rows of X_train and y_train to check the split
print("X_train:\n", X_train.head())
print("y_train:\n", y_train.head())

# Display the first few rows of X_test and y_test to check the split
print("X_test:\n", X_test.head())
print("y_test:\n", y_test.head())


X_train:
     takers  income  years  public  expend  rank
12       9     330  15.72    61.2   14.58  83.4
4        5     293  17.25    83.6   21.05  88.5
37      39     255  15.91    80.5   22.62  74.6
8        5     328  16.01    97.0   25.96  87.5
3        5     338  16.30    83.9   27.14  86.3
y_train:
 12     999
4     1045
37     889
8     1017
3     1045
Name: sat, dtype: int64
X_test:
     takers  income  years  public  expend  rank
13       8     316  15.92    79.5   22.19  83.7
39      52     295  16.08    88.8   22.23  72.4
30      40     261  14.48    92.1   30.49  79.3
45      48     258  14.39    90.2   17.93  74.1
17      16     333  16.83    88.3   26.56  81.8
y_test:
 13    997
39    888
30    908
45    860
17    983
Name: sat, dtype: int64


In [93]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data into a pandas DataFrame
data = {
    'State': ['Iowa', 'SouthDakota', 'NorthDakota', 'Kansas', 'Nebraska', 'Montana', 'Minnesota', 'Utah', 'Wyoming', 'Wisconsin',
              'Oklahoma', 'Arkansas', 'Tennessee', 'NewMexico', 'Idaho', 'Mississippi', 'Kentucky', 'Colorado', 'Washington', 'Arizona',
              'Illinois', 'Louisiana', 'Missouri', 'Michigan', 'WestVirginia', 'Alabama', 'Ohio', 'NewHampshire', 'Alaska', 'Nevada',
              'Oregon', 'Vermont', 'California', 'Delaware', 'Connecticut', 'NewYork', 'Maine', 'Florida', 'Hawaii', 'RhodeIsland',
              'Texas', 'Virginia', 'Iceland'],
    'Total_Score': [1088, 1075, 1068, 1045, 1045, 1033, 1028, 1022, 1017, 1011, 1001, 999, 999, 997, 995, 988, 985, 983, 982, 981, 977,
                    975, 975, 973, 968, 964, 958, 925, 923, 917, 908, 904, 899, 897, 896, 896, 890, 889, 881, 867, 853, 831, 820, 799, 798],
    'Rank': [3, 2, 3, 5, 5, 8, 7, 4, 5, 10, 5, 4, 9, 8, 7, 3, 6, 16, 19, 11, 14, 5, 10, 10, 7, 16, 56, 31, 18, 40, 54, 36, 42, 69, 59,
             46, 39, 39, 20, 17, 12, 21, 23, 34, 37],
    'Math_Score': [326, 264, 317, 338, 293, 263, 343, 333, 328, 304, 358, 295, 330, 316, 285, 315, 330, 333, 309, 314, 347, 394, 322, 335,
                   292, 313, 306, 248, 401, 288, 261, 225, 293, 277, 287, 236, 208, 255, 250, 224, 252, 246, 264, 250, 273],
    'Reading_Score': [16.79, 16.07, 16.57, 16.30, 17.25, 15.91, 17.41, 16.57, 16.01, 16.85, 15.95, 15.49, 15.72, 15.92, 16.18, 16.76, 16.92, 17.04,17.19, 17.27, 16.56, 16.67, 16.91, 16.64, 17.10, 16.40, 16.08, 16.34, 17.05, 16.22, 16.53, 17.13, 16.37, 16.11, 16.42, 16.73, 15.86, 16.06,
16.24, 16.51, 16.60, 16.37, 16.16, 16.66, 15.83, 15.75, 16.14, 16.67, 16.53, 16.22, 16.33],
'Science_Score': [16.04, 15.91, 15.97, 15.95, 16.16, 15.57, 16.00, 16.11, 15.71, 15.73, 15.68, 15.13, 15.51, 15.40, 15.53, 16.00, 15.75, 15.79, 15.90,
 15.75, 16.14, 16.23, 15.75, 16.06, 15.84, 15.68, 15.92, 15.70, 15.56, 15.69, 15.54, 15.59, 15.86, 15.73, 15.50, 15.44, 15.40,
 15.72, 15.62, 15.85, 15.61, 15.44, 15.75, 15.60, 15.53, 15.80, 15.60],
'Writing_Score': [16.36, 16.13, 16.12, 16.02, 16.31, 15.69, 16.13, 16.15, 15.87, 16.10, 16.25, 15.73, 16.02, 15.77, 15.94, 16.27, 16.15, 16.18, 16.12,
16.19, 16.25, 16.21, 16.27, 16.26, 15.96, 16.04, 16.17, 16.10, 15.90, 16.22, 16.11, 16.05, 16.28, 15.84, 16.06, 15.97, 15.75,
 15.93]
}





df = pd.DataFrame(data)

df.head()


ValueError: All arrays must be of the same length

In [90]:
from sklearn.model_selection import train_test_split

# Assuming X is the feature matrix and y is the target variable
X = df.drop('Total_Score', axis=1)  # Features
X.head()
y = df['Total_Score']  # Target variable
y.head()
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


KeyError: "['Total_Score'] not found in axis"