# Introduction to DataFrames
- pandas is built on top of NumPy and Matplotlib.
- pandas is designed to work with rectangular (tabular) data, such as a 2D NumPy array.
- This rectangular data is represented as the DataFrame object.
## Exploring DataFrame
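1. .head() -> Returns the first few rows of the DataFrame.
2. .info() -> Displays the column names, their data types, and non-null counts.
3. .shape -> Returns a tuple with the number of rows and columns.
4. .describe() -> Computes summary statistics for the numerical columns.
5. .values, .columns and .index -> The data as a 2D NumPy array, the column labels, and the row labels.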

In [2]:
import pandas as pd

# Load the students dataset into a DataFrame.
# NOTE: this is an absolute, machine-specific path; adjust it to wherever students.csv lives.
students = pd.read_csv(r'C:\Users\Dell\OneDrive\Desktop\KaranCodes\Datacampcourses\Associate-Data-Scientist-Python-Track\resources\DatamanipulationwithPandas-datasets\students.csv')

# View the top 7 rows of the students data
print(students.head(7))

# Show a concise summary of students (column names, non-null counts, dtypes).
# .info() prints the summary itself and returns None, so it is not wrapped in
# print() -- doing so would emit a stray "None" line after the summary.
students.info()

# Check the size (rows, columns) of students
print(students.shape)

# Show the statistical description of numerical columns in students
print(students.describe())

   student_id first_name last_name  age  gender             major  gpa  \
0           1       John       Doe   20    Male  Computer Science  3.5   
1           2       Emma     Smith   22  Female          Business  3.8   
2           3       Noah   Johnson   19    Male       Engineering  3.2   
3           4        Ava  Williams   21  Female       Mathematics  3.7   
4           5       Liam     Brown   20    Male           Physics  3.1   
5           6     Olivia     Jones   23  Female        Psychology  3.9   
6           7      Mason    Garcia   18    Male           Biology  2.9   

   credits_completed  
0                 60  
1                 90  
2                 45  
3                 80  
4                 55  
5                100  
6                 30  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   student_id         5

In [3]:
# Print the values of students (the underlying data as a 2D array)
print(students.values)

# Print the column labels of students
print(students.columns)

# Print the row index of students
print(students.index)

[[1 'John' 'Doe' 20 'Male' 'Computer Science' 3.5 60]
 [2 'Emma' 'Smith' 22 'Female' 'Business' 3.8 90]
 [3 'Noah' 'Johnson' 19 'Male' 'Engineering' 3.2 45]
 [4 'Ava' 'Williams' 21 'Female' 'Mathematics' 3.7 80]
 [5 'Liam' 'Brown' 20 'Male' 'Physics' 3.1 55]
 [6 'Olivia' 'Jones' 23 'Female' 'Psychology' 3.9 100]
 [7 'Mason' 'Garcia' 18 'Male' 'Biology' 2.9 30]
 [8 'Sophia' 'Miller' 22 'Female' 'History' 3.4 85]
 [9 'James' 'Davis' 21 'Male' 'Economics' 3.3 75]
 [10 'Isabella' 'Martinez' 20 'Female' 'Political Science' 3.6 65]
 [11 'Ethan' 'Lopez' 22 'Male' 'Chemistry' 3.0 70]
 [12 'Mia' 'Gonzalez' 19 'Female' 'Literature' 3.5 50]
 [13 'Lucas' 'Wilson' 20 'Male' 'Mechanical Engineering' 2.8 55]
 [14 'Charlotte' 'Anderson' 21 'Female' 'Education' 3.7 80]
 [15 'Logan' 'Thomas' 23 'Male' 'Computer Science' 3.6 95]
 [16 'Amelia' 'Taylor' 18 'Female' 'Business' 3.2 40]
 [17 'Alexander' 'Moore' 20 'Male' 'Engineering' 3.1 60]
 [18 'Evelyn' 'Jackson' 21 'Female' 'Mathematics' 3.8 85]
 [19 'Eli

## Sorting rows
- We can sort rows with sort_values() method.

In [7]:
# Order the rows alphabetically by the first_name column (ascending is the default)
students_ind = students.sort_values(by="first_name")

# Preview the first rows of the sorted result
print(students_ind.head())

    student_id first_name last_name  age  gender        major  gpa  \
21          22    Abigail     White   20  Female      History  3.3   
16          17  Alexander     Moore   20    Male  Engineering  3.1   
15          16     Amelia    Taylor   18  Female     Business  3.2   
3            4        Ava  Williams   21  Female  Mathematics  3.7   
29          30      Avery     Young   20  Female     Business  3.8   

    credits_completed  
21                 65  
16                 60  
15                 40  
3                  80  
29                 70  


In [6]:
# Rank students from highest to lowest gpa
student_grades = students.sort_values(by="gpa", ascending=False)

# Preview the highest-gpa rows
print(student_grades.head())

    student_id first_name last_name  age  gender        major  gpa  \
5            6     Olivia     Jones   23  Female   Psychology  3.9   
19          20     Harper       Lee   22  Female   Psychology  3.9   
33          34   Scarlett     Scott   19  Female   Psychology  3.9   
29          30      Avery     Young   20  Female     Business  3.8   
17          18     Evelyn   Jackson   21  Female  Mathematics  3.8   

    credits_completed  
5                 100  
19                 95  
33                 45  
29                 70  
17                 85  


### Sorting by multiple columns

In [9]:
# Sort students by ascending gpa; ties are ordered by descending credits_completed
student_info = students.sort_values(
    by=["gpa", "credits_completed"],
    ascending=[True, False],
)

# Preview the first rows of the sorted result
print(student_info.head())

    student_id first_name last_name  age gender                   major  gpa  \
34          35     Joseph     Green   21   Male                 Biology  2.7   
18          19     Elijah    Martin   19   Male                 Physics  2.7   
12          13      Lucas    Wilson   20   Male  Mechanical Engineering  2.8   
38          39       Owen      Hill   19   Male               Chemistry  2.8   
44          45     Julian     Gomez   22   Male             Engineering  2.9   

    credits_completed  
34                 75  
18                 35  
12                 55  
38                 40  
44                 90  


## Subsetting Columns
- To subset a single column, use square brackets containing the column name; to select several columns, pass a list of column names inside the brackets.

In [10]:
# Select the credits_completed column; single brackets with one column name return a Series
credits = students["credits_completed"]

# Print the first few values
print(credits.head())

0    60
1    90
2    45
3    80
4    55
Name: credits_completed, dtype: int64


In [11]:
# Select the gpa and credits_completed columns; a list of names inside the
# brackets returns a DataFrame (note: this rebinds student_info)
student_info = students[["gpa", "credits_completed"]]

# Print the top few rows
print(student_info.head())

   gpa  credits_completed
0  3.5                 60
1  3.8                 90
2  3.2                 45
3  3.7                 80
4  3.1                 55


## Using relational operators to subset (filter) rows

In [13]:
# Filter for rows where gpa is greater than 3.8
topper_students = students[students["gpa"] > 3.8]

# See the result
print(topper_students)

    student_id first_name last_name  age  gender       major  gpa  \
5            6     Olivia     Jones   23  Female  Psychology  3.9   
19          20     Harper       Lee   22  Female  Psychology  3.9   
33          34   Scarlett     Scott   19  Female  Psychology  3.9   

    credits_completed  
5                 100  
19                 95  
33                 45  


In [15]:
# Keep male Engineering majors older than 20. Each condition is wrapped in
# parentheses because & binds more tightly than the comparison operators.
adult_male_student = students[
    (students["age"] > 20)
    & (students["gender"] == "Male")
    & (students["major"] == "Engineering")
]

# Show the matching rows
print(adult_male_student)

    student_id first_name last_name  age gender        major  gpa  \
30          31     Samuel     Allen   21   Male  Engineering  3.4   
44          45     Julian     Gomez   22   Male  Engineering  2.9   

    credits_completed  
30                 80  
44                 90  


## Subsetting rows by categorical variables
- Subsetting data based on a categorical variable often uses the or operator (|) to select rows from multiple categories.
- A better alternative is to use the .isin() method, which simplifies the condition.
- With .isin(), we can check if values belong to a list of categories in one condition instead of writing multiple separate conditions.

In [3]:
import pandas as pd

# Read the students CSV into a DataFrame
students = pd.read_csv(r'C:\Users\Dell\OneDrive\Desktop\KaranCodes\Datacampcourses\Associate-Data-Scientist-Python-Track\resources\DatamanipulationwithPandas-datasets\students.csv')

# Majors that count as science-related for this example
science_majors = [
    "Computer Science",
    "Physics",
    "Biology",
    "Chemistry",
    "Engineering",
    "Mathematics",
]

# Keep only the rows whose major appears in science_majors
science_students = students[students["major"].isin(science_majors)]

# Preview the filtered rows
print(science_students.head())

   student_id first_name last_name  age  gender             major  gpa  \
0           1       John       Doe   20    Male  Computer Science  3.5   
2           3       Noah   Johnson   19    Male       Engineering  3.2   
3           4        Ava  Williams   21  Female       Mathematics  3.7   
4           5       Liam     Brown   20    Male           Physics  3.1   
6           7      Mason    Garcia   18    Male           Biology  2.9   

   credits_completed  
0                 60  
2                 45  
3                 80  
4                 55  
6                 30  


## New columns
### Adding new columns

- We can add new columns to a DataFrame; we're not limited to the existing data.
- This process is also known as transforming, mutating, or feature engineering.
- New columns can be:
  - Created from scratch.
  - Derived from existing columns by:
    - Adding columns together.
    - Changing the units of existing data.

In [5]:
# Add a new column 'performance_score' as GPA multiplied by credits_completed
# (assigning to a new column name adds the column to students in place)
students["performance_score"] = students["gpa"] * students["credits_completed"]

# Add a new column 'p_credits' as proportion of credits completed out of 120 (assuming 120 credits needed for graduation)
students["p_credits"] = students["credits_completed"] / 120

# See the result, now including the two new columns
print(students.head())

   student_id first_name last_name  age  gender             major  gpa  \
0           1       John       Doe   20    Male  Computer Science  3.5   
1           2       Emma     Smith   22  Female          Business  3.8   
2           3       Noah   Johnson   19    Male       Engineering  3.2   
3           4        Ava  Williams   21  Female       Mathematics  3.7   
4           5       Liam     Brown   20    Male           Physics  3.1   

   credits_completed  performance_score  p_credits  
0                 60              210.0   0.500000  
1                 90              342.0   0.750000  
2                 45              144.0   0.375000  
3                 80              296.0   0.666667  
4                 55              170.5   0.458333  


### Multiple manipulations

In [7]:
# Derive gpa_per_credit: GPA divided by the number of credits completed
students["gpa_per_credit"] = students["gpa"] / students["credits_completed"]

# Keep only the rows whose gpa_per_credit exceeds 0.05
high_efficiency_students = students[students["gpa_per_credit"] > 0.05]

# Order those rows from the highest gpa_per_credit to the lowest
high_efficiency_sorted = high_efficiency_students.sort_values(
    by="gpa_per_credit",
    ascending=False,
)

# Narrow the sorted rows down to the name columns and gpa_per_credit
result = high_efficiency_sorted[
    ["first_name", "last_name", "gpa_per_credit"]
]

# Preview the first rows of the result
print(result.head())

   first_name last_name  gpa_per_credit
27      Sofia  Robinson        0.116667
6       Mason    Garcia        0.096667
33   Scarlett     Scott        0.086667
15     Amelia    Taylor        0.080000
18     Elijah    Martin        0.077143
