In [1]:
import pandas as pd

# Pandas

In [2]:
users = pd.read_table('./data/user.tbl', sep='|')
ufo = pd.read_csv('./data/ufo.csv')

### Apply

In [11]:
# apply an arbitrary function to each value of a Pandas column,
# storing the result in a new column

users.head()
def under30(num):
    return num < 30

user_is_under_30 = users.loc[:,'age'].apply(under30)
users.loc[user_is_under_30,:]

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
2,3,23,M,writer,32067
3,4,24,M,technician,43537
8,9,29,M,student,01002
11,12,28,F,other,06405
15,16,21,M,entertainment,10309
20,21,26,M,writer,30068
21,22,25,M,writer,40206
23,24,21,F,artist,94533
29,30,7,M,student,55436


In [7]:
under30(50)

False

In [13]:
# apply an arbitrary function to each row of a DataFrame,
# storing the result in a new column

users.loc[:,'under30'] = user_is_under_30
users

Unnamed: 0,user_id,age,gender,occupation,zip_code,under30
0,1,24,M,technician,85711,True
1,2,53,F,other,94043,False
2,3,23,M,writer,32067,True
3,4,24,M,technician,43537,True
4,5,33,F,other,15213,False
5,6,42,M,executive,98101,False
6,7,57,M,administrator,91344,False
7,8,36,M,administrator,05201,False
8,9,29,M,student,01002,True
9,10,53,M,lawyer,90703,False


In [24]:
#applying row wise to dataframe instead of element wise to a series
def over30_or_female(row):
    return row['age']>30 or row['gender'] == "F"

users.loc[:,'over30_or_female'] = users.apply(over30_or_female, axis=1)
users

# compact way to do it:
users.loc[:,'over30_or_female'] = (
    users.apply(lambda row: row['age']>30 or row['gender'] == "F", axis=1)
)
users
#lambda is basically creating function on the fly

Unnamed: 0,user_id,age,gender,occupation,zip_code,under30,over30_or_female
0,1,24,M,technician,85711,True,False
1,2,53,F,other,94043,False,True
2,3,23,M,writer,32067,True,False
3,4,24,M,technician,43537,True,False
4,5,33,F,other,15213,False,True
5,6,42,M,executive,98101,False,True
6,7,57,M,administrator,91344,False,True
7,8,36,M,administrator,05201,False,True
8,9,29,M,student,01002,True,False
9,10,53,M,lawyer,90703,False,True


### String Methods

In [25]:
# Use string methods to change State abbreviations in ufo data to uppercase
ufo.loc[:,'State'].str.lower()

0        ny
1        nj
2        co
3        ks
4        ny
5        nd
6        ca
7        mi
8        ak
9        or
10       ca
11       al
12       sc
13       ia
14       mi
15       ca
16       ca
17       ga
18       tn
19       ak
20       ne
21       la
22       la
23       ky
24       wv
25       ca
26       wv
27       nm
28       nm
29       ut
         ..
80513    nj
80514    ma
80515    va
80516    ca
80517    nh
80518    pa
80519    il
80520    pa
80521    oh
80522    ma
80523    md
80524    wa
80525    ia
80526    ma
80527    wa
80528    oh
80529    wa
80530    fl
80531    va
80532    ma
80533    ia
80534    tx
80535    ky
80536    pa
80537    ne
80538    ne
80539    oh
80540    az
80541    il
80542    fl
Name: State, Length: 80543, dtype: object

In [27]:
# Get a Boolean series that indicates which elements of ufo
# "Colors Reported" column contain the substring "RED"
ufo.loc[:,'Colors Reported'].str.contains('RED', na='False')

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12        True
13       False
14       False
15       False
16       False
17       False
18       False
19        True
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
80513    False
80514    False
80515    False
80516    False
80517    False
80518    False
80519     True
80520    False
80521    False
80522    False
80523    False
80524     True
80525    False
80526    False
80527    False
80528     True
80529    False
80530    False
80531    False
80532    False
80533    False
80534    False
80535    False
80536     True
80537    False
80538    False
80539    False
80540     True
80541     True
80542    False
Name: Colors Reported, Length: 80543, dtype: object

In [28]:
help(str)

Help on class str in module builtins:

class str(object)
 |  str(object='') -> str
 |  str(bytes_or_buffer[, encoding[, errors]]) -> str
 |  
 |  Create a new string object from the given object. If encoding or
 |  errors is specified, then the object must expose a data buffer
 |  that will be decoded using the given encoding and error handler.
 |  Otherwise, returns the result of object.__str__() (if defined)
 |  or repr(object).
 |  encoding defaults to sys.getdefaultencoding().
 |  errors defaults to 'strict'.
 |  
 |  Methods defined here:
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __contains__(self, key, /)
 |      Return key in self.
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __format__(...)
 |      S.__format__(format_spec) -> str
 |      
 |      Return a formatted version of S as described by format_spec.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getatt

### Datetimes

In [34]:
# convert a string to the datetime format
ufo.loc[:,'Time'] = pd.to_datetime(ufo.loc[:,'Time'])

In [38]:
# Get ufo Time column in hours
ufo.loc[:,'Time'].apply(type)
ufo.loc[:,'Time'].dt.hour

0        22
1        20
2        14
3        13
4        19
5        15
6         0
7         0
8        17
9         0
10       21
11       20
12       20
13        2
14       13
15       11
16        0
17       22
18        1
19       23
20       15
21        0
22        0
23       11
24       10
25       12
26       12
27       11
28       12
29       10
         ..
80513     8
80514     9
80515    16
80516    17
80517    19
80518    19
80519    20
80520    20
80521    20
80522    20
80523    20
80524    21
80525    21
80526    21
80527    21
80528    21
80529    22
80530    22
80531    22
80532    22
80533    22
80534    22
80535    22
80536    23
80537    23
80538    23
80539     1
80540     2
80541     3
80542     5
Name: Time, Length: 80543, dtype: int64

In [43]:
# Get number of days spanned by events in ufo dataset
ufo.loc[:,'Time'].max() - ufo.loc[:,'Time'].min()

Timedelta('30776 days 07:30:00')

### Changing How DataFrames Are Displayed

In [47]:
# change the maximum number of rows and columns printed
pd.set_option('max_rows',100)
pd.set_option('max_columns', 3)

# Python Style

**Warning:** Everybody likes clean, consistent code; but nobody likes a legalistic zealot.

### PEP8

[PEP8](https://www.python.org/dev/peps/pep-0008/) is the standard style guide for Python.

- Use 4-space indentation
- Keep line lengths below 80 characters.
    - Prefer splitting inside (), [], {} instead of using \\.
    - Put operator at the start of the new line instead of the end of the old line.

Yes:

```python
my_new_variable = (old_variable1
                   * old_variable2
                   + old_variable3
                   - old_variable4
                   - old_variable5)

my_new_variable = (
    old_variable1 * old_variable2 + old_variable3 - old_variable4 - old_variable5
    )

my_new_variable = (old_variable1 * old_variable2 + old_variable3
                   - old_variable4 - old_variable5)
```

No:

```python
my_new_variable = old_variable1 * old_variable2 + old_variable3 - old_variable4  - old_variable5

my_new_variable = old_variable1\
                  * old_variable2\
                  + old_variable3\
                  - old_variable4\
                  - old_variable5
```

- Creating strings with either single or double-quotes is fine. If your string contains quotes of one type, use the other type to create it to that you don't have to use backslashes in the strings.

Yes:

```python
'dog'
"cat"
"can't"
'She said "run!"'
```

No:

```python
'can\'t'
"She said \"run!\""
```

- [Whitespace rules](https://www.python.org/dev/peps/pep-0008/#id27)
- Always surround these binary operators with a single space on either side: assignment (=), augmented assignment (+=, -= etc.), comparisons (==, <, >, !=, <>, <=, >=, in, not in, is, is not), Booleans (and, or, not).
- Don't use spaces around the = sign when used to indicate a keyword argument or a default parameter value.

Yes:
```python
def complex(real, imag=0.0):
    return magic(r=real, i=imag)
```

No:
```python
def complex(real, imag = 0.0):
    return magic(r = real, i = imag)
```

### LowClass Python

[This style guide](http://columbia-applied-data-science.github.io/pages/lowclass-python-style-guide.html) is meant for data scientists and others who write code but aren't exactly professional programmers.

- Write functions that take well-defined inputs and produce well-defined output.
- Do not have multiple levels of nesting within a function. As soon as you drop down to a lower level of abstraction, create a helper function.

```python
def extract_feature_counts(data_string):
    """
    Some docstring here...
    """
    cleaned_data_string = _clean(data_string)
    word_counts = _count_words(cleaned_data_string)

    return word_counts


def _clean(data_string):
    # Some code here.
    return cleaned_data_string


def _count_words(data_string):
    # Some code here.
    return word_counts
```

**Notice:**

- A leading underscore in a function name indicates that the function is "private," meaning that you don't intend for anyone else to use it.

### General Principles of Software Design

1. Run all the tests.
2. Eliminate duplication.
3. Express your intent.
4. Minimize the number of classes, methods, and functions.

### Additional Resources

- [Google Style Guide](http://google.github.io/styleguide/pyguide.html)
- [Clean Code](https://www.amazon.com/Clean-Code-Handbook-Software-Craftsmanship/dp/0132350882)
- [The Pragmatic Programmer](https://www.amazon.com/Pragmatic-Programmer-Journeyman-Master/dp/020161622X/ref=pd_sim_14_3?_encoding=UTF8&pd_rd_i=020161622X&pd_rd_r=KAF4G8CGK7T9PFT998E2&pd_rd_w=u0wda&pd_rd_wg=KHmIe&psc=1&refRID=KAF4G8CGK7T9PFT998E2)

# Jupyter Notebooks, REPLs, Text Editors, and IDEs

### REPLs

- REPL = Read-Evaluate-Print Loop.

ipython a little nicer than python
-exit() to get out of python
-exit to get out of ipython

### Jupyter Notebooks

- Jupyter Notebooks are browser-based frontends to REPL sessions (using the `iPython` kernel for Python).
- Ability to mix text and media with code is powerful for documentation and reporting.
- Ability to run cells out of order is a double-edged sword.
    - Great for exploration and presentation.
    - Doesn't enforce reproducibility.

### Text Editors


##### Options

- **Too simple:** Notepad, TextEdit, Nano



- **Powerful but demanding:** Emacs, Vim
    


- **Approachable and moderately powerful:** Sublime Text, Atom



### IDEs

##### Options

- Rodeo (similar to RStudio)
- Spyder (similar to Matlab)
- PyCharm (aimed at Python developers)



##### Opinionated Advice

- **For this class:** Use Jupyter.
- **For quick one-off tasks:** Use `ipython`.
- **When writing code to run repeatedly:** Move to Atom.
- **When building Python packages:** Move to PyCharm.
- **At some point:**
    - Learn enough Vim to not embarrass yourself. If you like it, enable Vim keybindings elsewhere.
    - Try out Emacs, Rodeo, and Spyder.

# Practice

Work on `lesson07_exploratory_data_analysis/practice/eda-data_cleaning_intro-lab-master/pandas-cleaning-apply.ipynb` in pairs using the driver/navigator approach. One person (the "driver") writes the code (sharing his or her screen) while the other person (the "navigator") continually makes suggestions and reviews the code. The driver should talk about what he or she is doing, ask for input, and generally keep the navigator engaged.

Many professional programmers swear by this "pair programming" approach to software development.

After half of our time is up, I will have you switch roles and work on `lesson06_experiments_and_hypothesis_testing/practice/eda-telecomm_group_project-lab-master/telecomm-eda-group-lab.ipynb`.

If you have already worked on one or both of these notebooks, that's totally fine. Be the driver for the one that you have worked on less, and review what you have already done with your partner before you extend it. If you are the navigator for a notebook you have already worked on, don't pull out your code. Let your partner take the lead and give feedback on the direction they take.

# Final Project Part 1 due Thurs.

https://git.generalassemb.ly/datr1618/course-info#projects

# Unit Project 2 due next week Tues.

https://git.generalassemb.ly/datr1618/unit_project2

# Questions?

# Exit Ticket

```
=========================================
@channel
Exit Ticket: https://goo.gl/forms/OUw4gyTiRKMOTI3t2        

#feedback
=========================================
```