In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm

In [2]:
data = [np.random.standard_normal() for i in range(7)]
data

[2.047918176697641,
 0.1706016701309743,
 0.23445703307146082,
 -0.11372154592660066,
 1.1161431326176083,
 -1.2222249304130302,
 0.4986280250307278]

In [5]:
c = """
This is a longer string that
spans multiple lines
"""
print("line = ",c.count("\n"))

line =  3


In [6]:
template = "{0:.2f} {1:s} are worth US${2:d}"
template.format(88.46, "Argentine Pesos", 1)

'88.46 Argentine Pesos are worth US$1'

{0:.2f} means to format the first argument as a floating-point number with two decimal places.

{1:s} means to format the second argument as a string.

{2:d} means to format the third argument as an exact integer.

In [8]:
val = "español"
val_utf8 = val.encode("utf-8")
print("val_utf8 =", val_utf8, "type =", type(val_utf8))
print("decoded =", val_utf8.decode("utf-8"))

print(val.encode("latin1"))
print(val.encode("utf-16"))
print(val.encode("utf-16le"))

val_utf8 = b'espa\xc3\xb1ol' type = <class 'bytes'>
decoded = español
b'espa\xf1ol'
b'\xff\xfee\x00s\x00p\x00a\x00\xf1\x00o\x00l\x00'
b'e\x00s\x00p\x00a\x00\xf1\x00o\x00l\x00'


In [None]:
sequence = [1, 2, None, 4, None, 5]
total = 0
for value in sequence:
    if value is None:
        continue
    total += value

You can advance a for loop to the next iteration, skipping the remainder of the block, using the continue keyword. Consider this code, which sums up integers in a list and skips None values:

In [None]:
sequence = [1, 2, 0, 4, 6, 5, 2, 1]
total_until_5 = 0
for value in sequence:
    if value == 5:
        break
    total_until_5 += value

The break keyword only terminates the innermost for loop; any outer for loops will continue to run:

In [None]:
for i in range(4):
    for j in range(4):
        if j > i:
            break
        print((i, j))

In [None]:
if x < 0:
    print("negative!")
elif x == 0:
    # TODO: put something smart here
    pass
else:
    print("positive!")

pass is the “no-op” (or "do nothing") statement in Python. It can be used in blocks where no action is to be taken (or as a placeholder for code not yet implemented); it is only required because Python uses whitespace to delimit blocks:

In [None]:
print(list(range(0, 20, 2)))
print(list(range(5, 0, -1)))

In [None]:
import re

def clean_strings(strings):
    result = []
    for value in strings:
        value = value.strip()
        value = re.sub("[!#?]", "", value)
        value = value.title()
        result.append(value)
    return result

the re standard library module for regular expressions

In [None]:
def attempt_float(x):
    try:
        return float(x)
    except (TypeError, ValueError):
        return x

attempt_float("1.2345")
attempt_float("something")

In [None]:
f = open(path, mode="w")
def write_to_file(file, df):
    pd.df.to_csv(file)

try:
    write_to_file(f)
except:
    print("Failed")
else:
    print("Succeeded")
finally:
    f.close()

- r	Read-only mode
- w	Write-only mode; creates a new file (erasing the data for any file with the same name)
- x	Write-only mode; creates a new file, but fails if the file path already exists
- a	Append to existing file (create the file if it does not already exist)
- r+	Read and write
- b	Add to mode for binary files (i.e., "rb" or "wb")
- t	Text mode for files (automatically decoding bytes to Unicode). This is the default if not specified.

In [9]:
my_arr = np.arange(1_000_000)
my_list = list(range(1_000_000))

%timeit my_arr2 = my_arr * 2
%timeit my_list2 = [x * 2 for x in my_list]

3.17 ms ± 396 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
129 ms ± 6.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
x = np.array([[1., 2., 3.], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])
print(x.dot(y))
print(np.dot(x, y))

x.dot(y) is equivalent to np.dot(x, y)

In [11]:
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
print(obj3)
obj3.to_dict()
print(obj3)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [12]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2.to_numpy()
print(frame2)

   year   state  pop debt
0  2000    Ohio  1.5  NaN
1  2001    Ohio  1.7  NaN
2  2002    Ohio  3.6  NaN
3  2001  Nevada  2.4  NaN
4  2002  Nevada  2.9  NaN
5  2003  Nevada  3.2  NaN


In [19]:
frame = pd.DataFrame(np.random.standard_normal((4, 3)), 
                    columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
print(frame)

def f1(x):
    return x.max() - x.min()

print(frame.apply(f1))
print(frame.apply(f1, axis="columns"))

def my_format(x):
    return f"{x:.2f}"

print(frame.applymap(my_format))

               b         d         e
Utah   -0.408715 -1.036801 -0.578201
Ohio    0.364924  0.006157 -0.244444
Texas   1.370354 -0.255821 -0.090652
Oregon  0.793403 -0.966789 -0.948804
b    1.779069
d    1.042958
e    0.858152
dtype: float64
Utah      0.628086
Ohio      0.609368
Texas     1.626176
Oregon    1.760192
dtype: float64
            b      d      e
Utah    -0.41  -1.04  -0.58
Ohio     0.36   0.01  -0.24
Texas    1.37  -0.26  -0.09
Oregon   0.79  -0.97  -0.95


In [21]:
import os

# file = os.path.abspath(r"C:\datasets\Line assignment.xlsx")
# df = pd.read_excel(file, sheet_name='StageOverview')

file_y_price = os.path.abspath(r"C:\datasets\yahoo_price.pkl")
file_y_volume = os.path.abspath(r"C:\datasets\yahoo_volume.pkl")

price = pd.read_pickle(file_y_price)
volume = pd.read_pickle(file_y_volume)

returns = price.pct_change()

print("correlation:", returns["MSFT"].corr(returns["IBM"]))
print("covarience:", returns["MSFT"].cov(returns["IBM"]))

correlation: 0.49976361144151155
covarience: 8.870655479703546e-05


In [None]:
returns.corr()