# (1) Foundations:

In [1]:
# Useful snippets
#------Colab:
from IPython import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
#------Mount Google Drive:
from google.colab import drive
drive.mount('/content/drive')
#------PWD & Py Version:
%pwd
%cd drive/My Drive/Works
!python --version

Mounted at /content/drive
/content/drive/My Drive/Works
Python 3.10.12


In [2]:
# Basic Packs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

---------------NUMPY & PANDAS----------------

In [3]:
# Q1. Play Time...
# Report the installed pandas and NumPy versions (one output per expression).
pd.__version__
np.__version__

'2.1.4'

'1.26.4'

In [5]:
# Q2. Data & Records...
# Read the laptops CSV directly from GitHub, then print the column summary
# (row count, non-null counts and dtypes).
laptops_csv_url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(laptops_csv_url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Laptop        2160 non-null   object 
 1   Status        2160 non-null   object 
 2   Brand         2160 non-null   object 
 3   Model         2160 non-null   object 
 4   CPU           2160 non-null   object 
 5   RAM           2160 non-null   int64  
 6   Storage       2160 non-null   int64  
 7   Storage type  2118 non-null   object 
 8   GPU           789 non-null    object 
 9   Screen        2156 non-null   float64
 10  Touch         2160 non-null   object 
 11  Final Price   2160 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 202.6+ KB


In [7]:
# Q3. Laptop Brands
# Number of distinct values in the Brand column.
df['Brand'].nunique()

27

In [8]:
# Q4. Missing Values
# Per-column count of missing entries.
df.isna().sum()

Unnamed: 0,0
Laptop,0
Status,0
Brand,0
Model,0
CPU,0
RAM,0
Storage,0
Storage type,42
GPU,1371
Screen,4


In [9]:
# Q5. Maximum Final price of Dell Notebooks in df
# Build the row mask once, then pick the price column for those rows in a single .loc lookup.
is_dell = df['Brand'] == 'Dell'
df.loc[is_dell, 'Final Price'].max()

3936.0

In [19]:
# Q6. Median of Screen column, fill missing values with the mode, and get the difference
median_before = df['Screen'].median()
screen_mode = df['Screen'].mode()[0]
# Assign the filled column back to the frame. Calling fillna(..., inplace=True) on
# a column selection is a chained in-place write: it is deprecated in recent pandas
# and leaves `df` unchanged once Copy-on-Write is enabled.
df['Screen'] = df['Screen'].fillna(screen_mode)
median_after = df['Screen'].median()
median_before
median_after
# The value the question actually asks for.
median_after - median_before

15.6

15.6

In [21]:
# Q7. Sum of weights
# Select all the "Innjoo" laptops from the dataset.
# Select only columns RAM, Storage, Screen.
# Get the underlying NumPy array. Let's call it X.
# Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
# Compute the inverse of XTX.
# Create an array y with values [1100, 1300, 800, 900, 1000, 1100].
# Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
# What's the sum of all the elements of the result?
innjoo = df[df['Brand'] == 'Innjoo']
# .to_numpy() gives the plain array the steps above ask for, so every product
# below is ndarray-only instead of mixing DataFrame and ndarray operands.
X = innjoo[['RAM', 'Storage', 'Screen']].to_numpy()
y = np.array([1100, 1300, 800, 900, 1000, 1100])
# y is hard-coded, so fail with a clear message if the number of selected rows differs.
assert X.shape[0] == y.shape[0], "y must have one target per Innjoo row"
XTX = X.T @ X
XTX_inv = np.linalg.inv(XTX)
# Same left-to-right evaluation as inv(XTX).dot(X.T).dot(y).
w = XTX_inv @ X.T @ y
w
w.sum()

array([45.58076606,  0.42783519, 45.29127938])

91.2998806299555

# (2) SPARK

In [None]:
#install java: headless OpenJDK 8 via apt-get; -qq plus the redirect to /dev/null keeps the cell output quiet
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
#set the JAVA_HOME environment variable for this process (path names the OpenJDK 8 amd64 JVM directory)
import os
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64")

In [None]:
#download SPARK (NEW DOWNLOAD LINK)
!wget -q http://apache.osuosl.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz

In [None]:
#extract the Spark tarball into the current working directory
!tar xf spark-3.3.1-bin-hadoop3.tgz

In [None]:
#create spark home variable
import os
# The tarball is extracted into the current working directory, which is not
# necessarily /content (an earlier cell changes directory). Prefer the
# extracted folder when it exists here; otherwise keep the original location.
spark_dir_name = "spark-3.3.1-bin-hadoop3"
spark_dir_here = os.path.abspath(spark_dir_name)
if os.path.isdir(spark_dir_here):
    os.environ["SPARK_HOME"] = spark_dir_here
else:
    os.environ["SPARK_HOME"] = "/content/" + spark_dir_name

In [None]:
#install findspark
#findspark locates the PySpark installation on the machine
#and adds its path to sys.path at runtime
#so that the PySpark modules can be imported

!pip install -q findspark

In [None]:
#import findspark and initialise it; must run before `import pyspark`
#NOTE(review): presumably picks up the SPARK_HOME set earlier - confirm
import findspark
findspark.init()

In [None]:
#import pyspark (added by findspark during runtime)
import pyspark

In [None]:
#import sparksession
from pyspark.sql import SparkSession

In [None]:
#create sparksession object
# "local[*]" is a master URL (run locally on all cores), not an application
# name, so it belongs in master(); appName() gets a human-readable label.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("foundations")
    .getOrCreate()
)

In [None]:
#print the version string reported by the active SparkSession
print("Apache Spark version: ", spark.version)

Apache Spark version:  3.3.1


In [None]:
#sample rows for the Spark demo; tuple fields follow the order of `columns` below
data = [
    ("James", "", "Smith", 30, "M", 60000),
    ("Michael", "Rose", "", 50, "M", 70000),
    ("Robert", "", "Williams", 42, "", 400000),
    ("Maria", "Anne", "Jones", 38, "F", 500000),
    ("Jen", "Mary", "Brown", 45, "F", 0),
]
columns = [
    "first_name",
    "middle_name",
    "last_name",
    "Age",
    "gender",
    "salary",
]

#build the DataFrame, then show its inferred schema and all rows without truncation
pysparkDF = spark.createDataFrame(data, schema=columns)
pysparkDF.printSchema()
pysparkDF.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+---+------+------+
|first_name|middle_name|last_name|Age|gender|salary|
+----------+-----------+---------+---+------+------+
|James     |           |Smith    |30 |M     |60000 |
|Michael   |Rose       |         |50 |M     |70000 |
|Robert    |           |Williams |42 |      |400000|
|Maria     |Anne       |Jones    |38 |F     |500000|
|Jen       |Mary       |Brown    |45 |F     |0     |
+----------+-----------+---------+---+------+------+



In [None]:
# NOTE: this import rebinds `max` in the notebook namespace to Spark's column
# function, so the Python builtin max() is shadowed from here on.
from pyspark.sql.functions import mean, col, max
#Example 1: overall averages
# show() only prints and returns None, so keep the aggregated DataFrame in df2
# and display it as a separate step.
df2 = pysparkDF.select(mean("age"), mean("salary"))
df2.show()
#Example 2: per-gender averages plus the highest salary in each group
pysparkDF.groupBy("gender") \
         .agg(mean("age"), mean("salary"), max("salary")) \
         .show()

+--------+-----------+
|avg(age)|avg(salary)|
+--------+-----------+
|    41.0|   206000.0|
+--------+-----------+

+------+--------+-----------+-----------+
|gender|avg(age)|avg(salary)|max(salary)|
+------+--------+-----------+-----------+
|     M|    40.0|    65000.0|      70000|
|     F|    41.5|   250000.0|     500000|
|      |    42.0|   400000.0|     400000|
+------+--------+-----------+-----------+

